GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file contains a pass that performs load / store related peephole
10
// optimizations. This pass should be run after register allocation.
11
//
12
// The pass runs after the PrologEpilogInserter where we emit the CFI
13
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the relative order of any two instructions if one
// of them has the FrameSetup/FrameDestroy flag or, alternatively, it must apply
// an ad-hoc fix to the unwind information.
17
//
18
//===----------------------------------------------------------------------===//
19
20
#include "AArch64InstrInfo.h"
21
#include "AArch64MachineFunctionInfo.h"
22
#include "AArch64Subtarget.h"
23
#include "MCTargetDesc/AArch64AddressingModes.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/Statistic.h"
26
#include "llvm/ADT/StringRef.h"
27
#include "llvm/ADT/iterator_range.h"
28
#include "llvm/Analysis/AliasAnalysis.h"
29
#include "llvm/CodeGen/MachineBasicBlock.h"
30
#include "llvm/CodeGen/MachineFunction.h"
31
#include "llvm/CodeGen/MachineFunctionPass.h"
32
#include "llvm/CodeGen/MachineInstr.h"
33
#include "llvm/CodeGen/MachineInstrBuilder.h"
34
#include "llvm/CodeGen/MachineOperand.h"
35
#include "llvm/CodeGen/MachineRegisterInfo.h"
36
#include "llvm/CodeGen/TargetRegisterInfo.h"
37
#include "llvm/IR/DebugLoc.h"
38
#include "llvm/MC/MCAsmInfo.h"
39
#include "llvm/MC/MCDwarf.h"
40
#include "llvm/MC/MCRegisterInfo.h"
41
#include "llvm/Pass.h"
42
#include "llvm/Support/CommandLine.h"
43
#include "llvm/Support/Debug.h"
44
#include "llvm/Support/DebugCounter.h"
45
#include "llvm/Support/ErrorHandling.h"
46
#include "llvm/Support/raw_ostream.h"
47
#include <cassert>
48
#include <cstdint>
49
#include <functional>
50
#include <iterator>
51
#include <limits>
52
#include <optional>
53
54
using namespace llvm;
55
56
#define DEBUG_TYPE "aarch64-ldst-opt"
57
58
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
59
STATISTIC(NumPostFolded, "Number of post-index updates folded");
60
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
61
STATISTIC(NumUnscaledPairCreated,
62
"Number of load/store from unscaled generated");
63
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
64
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65
STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformations "
                                   "that failed the alignment check");
67
68
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
69
"Controls which pairs are considered for renaming");
70
71
// The LdStLimit limits how far we search for load/store pairs.
72
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
73
cl::init(20), cl::Hidden);
74
75
// The UpdateLimit limits how far we search for update instructions when we form
76
// pre-/post-index instructions.
77
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
78
cl::Hidden);
79
80
// Enable register renaming to find additional store pairing opportunities.
81
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
82
cl::init(true), cl::Hidden);
83
84
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
85
86
namespace {
87
88
using LdStPairFlags = struct LdStPairFlags {
89
// If a matching instruction is found, MergeForward is set to true if the
90
// merge is to remove the first instruction and replace the second with
91
// a pair-wise insn, and false if the reverse is true.
92
bool MergeForward = false;
93
94
// SExtIdx gives the index of the result of the load pair that must be
95
// extended. The value of SExtIdx assumes that the paired load produces the
96
// value in this order: (I, returned iterator), i.e., -1 means no value has
97
// to be extended, 0 means I, and 1 means the returned iterator.
98
int SExtIdx = -1;
99
100
// If not none, RenameReg can be used to rename the result register of the
101
// first store in a pair. Currently this only works when merging stores
102
// forward.
103
std::optional<MCPhysReg> RenameReg;
104
105
LdStPairFlags() = default;
106
107
void setMergeForward(bool V = true) { MergeForward = V; }
108
bool getMergeForward() const { return MergeForward; }
109
110
void setSExtIdx(int V) { SExtIdx = V; }
111
int getSExtIdx() const { return SExtIdx; }
112
113
void setRenameReg(MCPhysReg R) { RenameReg = R; }
114
void clearRenameReg() { RenameReg = std::nullopt; }
115
std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
116
};
117
118
struct AArch64LoadStoreOpt : public MachineFunctionPass {
119
static char ID;
120
121
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
122
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
123
}
124
125
AliasAnalysis *AA;
126
const AArch64InstrInfo *TII;
127
const TargetRegisterInfo *TRI;
128
const AArch64Subtarget *Subtarget;
129
130
// Track which register units have been modified and used.
131
LiveRegUnits ModifiedRegUnits, UsedRegUnits;
132
LiveRegUnits DefinedInBB;
133
134
void getAnalysisUsage(AnalysisUsage &AU) const override {
135
AU.addRequired<AAResultsWrapperPass>();
136
MachineFunctionPass::getAnalysisUsage(AU);
137
}
138
139
// Scan the instructions looking for a load/store that can be combined
140
// with the current instruction into a load/store pair.
141
// Return the matching instruction if one is found, else MBB->end().
142
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
143
LdStPairFlags &Flags,
144
unsigned Limit,
145
bool FindNarrowMerge);
146
147
// Scan the instructions looking for a store that writes to the address from
148
// which the current load instruction reads. Return true if one is found.
149
bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
150
MachineBasicBlock::iterator &StoreI);
151
152
// Merge the two narrow store instructions indicated into a single wider store.
153
MachineBasicBlock::iterator
154
mergeNarrowZeroStores(MachineBasicBlock::iterator I,
155
MachineBasicBlock::iterator MergeMI,
156
const LdStPairFlags &Flags);
157
158
// Merge the two instructions indicated into a single pair-wise instruction.
159
MachineBasicBlock::iterator
160
mergePairedInsns(MachineBasicBlock::iterator I,
161
MachineBasicBlock::iterator Paired,
162
const LdStPairFlags &Flags);
163
164
// Promote the load that reads directly from the address stored to.
165
MachineBasicBlock::iterator
166
promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
167
MachineBasicBlock::iterator StoreI);
168
169
// Scan the instruction list to find a base register update that can
170
// be combined with the current instruction (a load or store) using
171
// pre or post indexed addressing with writeback. Scan forwards.
172
MachineBasicBlock::iterator
173
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
174
int UnscaledOffset, unsigned Limit);
175
176
// Scan the instruction list to find a base register update that can
177
// be combined with the current instruction (a load or store) using
178
// pre or post indexed addressing with writeback. Scan backwards.
179
MachineBasicBlock::iterator
180
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
181
182
// Find an instruction that updates the base register of the ld/st
183
// instruction.
184
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
185
unsigned BaseReg, int Offset);
186
187
// Merge a pre- or post-index base register update into a ld/st instruction.
188
MachineBasicBlock::iterator
189
mergeUpdateInsn(MachineBasicBlock::iterator I,
190
MachineBasicBlock::iterator Update, bool IsPreIdx);
191
192
// Find and merge zero store instructions.
193
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
194
195
// Find and pair ldr/str instructions.
196
bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
197
198
// Find and promote load instructions which read directly from store.
199
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
200
201
// Find and merge base register updates before or after a ld/st instruction.
202
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
203
204
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
205
206
bool runOnMachineFunction(MachineFunction &Fn) override;
207
208
MachineFunctionProperties getRequiredProperties() const override {
209
return MachineFunctionProperties().set(
210
MachineFunctionProperties::Property::NoVRegs);
211
}
212
213
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
214
};
215
216
char AArch64LoadStoreOpt::ID = 0;
217
218
} // end anonymous namespace
219
220
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
221
AARCH64_LOAD_STORE_OPT_NAME, false, false)
222
223
static bool isNarrowStore(unsigned Opc) {
224
switch (Opc) {
225
default:
226
return false;
227
case AArch64::STRBBui:
228
case AArch64::STURBBi:
229
case AArch64::STRHHui:
230
case AArch64::STURHHi:
231
return true;
232
}
233
}
234
235
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source register.
237
static bool isTagStore(const MachineInstr &MI) {
238
switch (MI.getOpcode()) {
239
default:
240
return false;
241
case AArch64::STGi:
242
case AArch64::STZGi:
243
case AArch64::ST2Gi:
244
case AArch64::STZ2Gi:
245
return true;
246
}
247
}
248
249
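// Returns the non-sign-extending equivalent of a sign-extending load opcode
// (e.g. LDRSWui -> LDRWui) and Opc itself for other pairable load/store
// opcodes. Sets *IsValidLdStrOpc to false if Opc is not a candidate at all.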
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
250
bool *IsValidLdStrOpc = nullptr) {
251
if (IsValidLdStrOpc)
252
*IsValidLdStrOpc = true;
253
switch (Opc) {
254
default:
255
if (IsValidLdStrOpc)
256
*IsValidLdStrOpc = false;
257
return std::numeric_limits<unsigned>::max();
258
case AArch64::STRDui:
259
case AArch64::STURDi:
260
case AArch64::STRDpre:
261
case AArch64::STRQui:
262
case AArch64::STURQi:
263
case AArch64::STRQpre:
264
case AArch64::STRBBui:
265
case AArch64::STURBBi:
266
case AArch64::STRHHui:
267
case AArch64::STURHHi:
268
case AArch64::STRWui:
269
case AArch64::STRWpre:
270
case AArch64::STURWi:
271
case AArch64::STRXui:
272
case AArch64::STRXpre:
273
case AArch64::STURXi:
274
case AArch64::LDRDui:
275
case AArch64::LDURDi:
276
case AArch64::LDRDpre:
277
case AArch64::LDRQui:
278
case AArch64::LDURQi:
279
case AArch64::LDRQpre:
280
case AArch64::LDRWui:
281
case AArch64::LDURWi:
282
case AArch64::LDRWpre:
283
case AArch64::LDRXui:
284
case AArch64::LDURXi:
285
case AArch64::LDRXpre:
286
case AArch64::STRSui:
287
case AArch64::STURSi:
288
case AArch64::STRSpre:
289
case AArch64::LDRSui:
290
case AArch64::LDURSi:
291
case AArch64::LDRSpre:
292
return Opc;
293
case AArch64::LDRSWui:
294
return AArch64::LDRWui;
295
case AArch64::LDURSWi:
296
return AArch64::LDURWi;
297
case AArch64::LDRSWpre:
298
return AArch64::LDRWpre;
299
}
300
}
301
302
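// Returns the store opcode of twice the width of Opc; used when merging two
// adjacent zero stores into a single wider store.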
static unsigned getMatchingWideOpcode(unsigned Opc) {
303
switch (Opc) {
304
default:
305
llvm_unreachable("Opcode has no wide equivalent!");
306
case AArch64::STRBBui:
307
return AArch64::STRHHui;
308
case AArch64::STRHHui:
309
return AArch64::STRWui;
310
case AArch64::STURBBi:
311
return AArch64::STURHHi;
312
case AArch64::STURHHi:
313
return AArch64::STURWi;
314
case AArch64::STURWi:
315
return AArch64::STURXi;
316
case AArch64::STRWui:
317
return AArch64::STRXui;
318
}
319
}
320
321
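// Maps a single load/store opcode to the corresponding load/store-pair opcode.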
static unsigned getMatchingPairOpcode(unsigned Opc) {
322
switch (Opc) {
323
default:
324
llvm_unreachable("Opcode has no pairwise equivalent!");
325
case AArch64::STRSui:
326
case AArch64::STURSi:
327
return AArch64::STPSi;
328
case AArch64::STRSpre:
329
return AArch64::STPSpre;
330
case AArch64::STRDui:
331
case AArch64::STURDi:
332
return AArch64::STPDi;
333
case AArch64::STRDpre:
334
return AArch64::STPDpre;
335
case AArch64::STRQui:
336
case AArch64::STURQi:
337
return AArch64::STPQi;
338
case AArch64::STRQpre:
339
return AArch64::STPQpre;
340
case AArch64::STRWui:
341
case AArch64::STURWi:
342
return AArch64::STPWi;
343
case AArch64::STRWpre:
344
return AArch64::STPWpre;
345
case AArch64::STRXui:
346
case AArch64::STURXi:
347
return AArch64::STPXi;
348
case AArch64::STRXpre:
349
return AArch64::STPXpre;
350
case AArch64::LDRSui:
351
case AArch64::LDURSi:
352
return AArch64::LDPSi;
353
case AArch64::LDRSpre:
354
return AArch64::LDPSpre;
355
case AArch64::LDRDui:
356
case AArch64::LDURDi:
357
return AArch64::LDPDi;
358
case AArch64::LDRDpre:
359
return AArch64::LDPDpre;
360
case AArch64::LDRQui:
361
case AArch64::LDURQi:
362
return AArch64::LDPQi;
363
case AArch64::LDRQpre:
364
return AArch64::LDPQpre;
365
case AArch64::LDRWui:
366
case AArch64::LDURWi:
367
return AArch64::LDPWi;
368
case AArch64::LDRWpre:
369
return AArch64::LDPWpre;
370
case AArch64::LDRXui:
371
case AArch64::LDURXi:
372
return AArch64::LDPXi;
373
case AArch64::LDRXpre:
374
return AArch64::LDPXpre;
375
case AArch64::LDRSWui:
376
case AArch64::LDURSWi:
377
return AArch64::LDPSWi;
378
case AArch64::LDRSWpre:
379
return AArch64::LDPSWpre;
380
}
381
}
382
383
static unsigned isMatchingStore(MachineInstr &LoadInst,
384
MachineInstr &StoreInst) {
385
unsigned LdOpc = LoadInst.getOpcode();
386
unsigned StOpc = StoreInst.getOpcode();
387
switch (LdOpc) {
388
default:
389
llvm_unreachable("Unsupported load instruction!");
390
case AArch64::LDRBBui:
391
return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
392
StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
393
case AArch64::LDURBBi:
394
return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
395
StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
396
case AArch64::LDRHHui:
397
return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
398
StOpc == AArch64::STRXui;
399
case AArch64::LDURHHi:
400
return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
401
StOpc == AArch64::STURXi;
402
case AArch64::LDRWui:
403
return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
404
case AArch64::LDURWi:
405
return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
406
case AArch64::LDRXui:
407
return StOpc == AArch64::STRXui;
408
case AArch64::LDURXi:
409
return StOpc == AArch64::STURXi;
410
}
411
}
412
413
static unsigned getPreIndexedOpcode(unsigned Opc) {
414
// FIXME: We don't currently support creating pre-indexed loads/stores when
415
// the load or store is the unscaled version. If we decide to perform such an
416
// optimization in the future the cases for the unscaled loads/stores will
417
// need to be added here.
418
switch (Opc) {
419
default:
420
llvm_unreachable("Opcode has no pre-indexed equivalent!");
421
case AArch64::STRSui:
422
return AArch64::STRSpre;
423
case AArch64::STRDui:
424
return AArch64::STRDpre;
425
case AArch64::STRQui:
426
return AArch64::STRQpre;
427
case AArch64::STRBBui:
428
return AArch64::STRBBpre;
429
case AArch64::STRHHui:
430
return AArch64::STRHHpre;
431
case AArch64::STRWui:
432
return AArch64::STRWpre;
433
case AArch64::STRXui:
434
return AArch64::STRXpre;
435
case AArch64::LDRSui:
436
return AArch64::LDRSpre;
437
case AArch64::LDRDui:
438
return AArch64::LDRDpre;
439
case AArch64::LDRQui:
440
return AArch64::LDRQpre;
441
case AArch64::LDRBBui:
442
return AArch64::LDRBBpre;
443
case AArch64::LDRHHui:
444
return AArch64::LDRHHpre;
445
case AArch64::LDRWui:
446
return AArch64::LDRWpre;
447
case AArch64::LDRXui:
448
return AArch64::LDRXpre;
449
case AArch64::LDRSWui:
450
return AArch64::LDRSWpre;
451
case AArch64::LDPSi:
452
return AArch64::LDPSpre;
453
case AArch64::LDPSWi:
454
return AArch64::LDPSWpre;
455
case AArch64::LDPDi:
456
return AArch64::LDPDpre;
457
case AArch64::LDPQi:
458
return AArch64::LDPQpre;
459
case AArch64::LDPWi:
460
return AArch64::LDPWpre;
461
case AArch64::LDPXi:
462
return AArch64::LDPXpre;
463
case AArch64::STPSi:
464
return AArch64::STPSpre;
465
case AArch64::STPDi:
466
return AArch64::STPDpre;
467
case AArch64::STPQi:
468
return AArch64::STPQpre;
469
case AArch64::STPWi:
470
return AArch64::STPWpre;
471
case AArch64::STPXi:
472
return AArch64::STPXpre;
473
case AArch64::STGi:
474
return AArch64::STGPreIndex;
475
case AArch64::STZGi:
476
return AArch64::STZGPreIndex;
477
case AArch64::ST2Gi:
478
return AArch64::ST2GPreIndex;
479
case AArch64::STZ2Gi:
480
return AArch64::STZ2GPreIndex;
481
case AArch64::STGPi:
482
return AArch64::STGPpre;
483
}
484
}
485
486
static unsigned getPostIndexedOpcode(unsigned Opc) {
487
switch (Opc) {
488
default:
489
llvm_unreachable("Opcode has no post-indexed wise equivalent!");
490
case AArch64::STRSui:
491
case AArch64::STURSi:
492
return AArch64::STRSpost;
493
case AArch64::STRDui:
494
case AArch64::STURDi:
495
return AArch64::STRDpost;
496
case AArch64::STRQui:
497
case AArch64::STURQi:
498
return AArch64::STRQpost;
499
case AArch64::STRBBui:
500
return AArch64::STRBBpost;
501
case AArch64::STRHHui:
502
return AArch64::STRHHpost;
503
case AArch64::STRWui:
504
case AArch64::STURWi:
505
return AArch64::STRWpost;
506
case AArch64::STRXui:
507
case AArch64::STURXi:
508
return AArch64::STRXpost;
509
case AArch64::LDRSui:
510
case AArch64::LDURSi:
511
return AArch64::LDRSpost;
512
case AArch64::LDRDui:
513
case AArch64::LDURDi:
514
return AArch64::LDRDpost;
515
case AArch64::LDRQui:
516
case AArch64::LDURQi:
517
return AArch64::LDRQpost;
518
case AArch64::LDRBBui:
519
return AArch64::LDRBBpost;
520
case AArch64::LDRHHui:
521
return AArch64::LDRHHpost;
522
case AArch64::LDRWui:
523
case AArch64::LDURWi:
524
return AArch64::LDRWpost;
525
case AArch64::LDRXui:
526
case AArch64::LDURXi:
527
return AArch64::LDRXpost;
528
case AArch64::LDRSWui:
529
return AArch64::LDRSWpost;
530
case AArch64::LDPSi:
531
return AArch64::LDPSpost;
532
case AArch64::LDPSWi:
533
return AArch64::LDPSWpost;
534
case AArch64::LDPDi:
535
return AArch64::LDPDpost;
536
case AArch64::LDPQi:
537
return AArch64::LDPQpost;
538
case AArch64::LDPWi:
539
return AArch64::LDPWpost;
540
case AArch64::LDPXi:
541
return AArch64::LDPXpost;
542
case AArch64::STPSi:
543
return AArch64::STPSpost;
544
case AArch64::STPDi:
545
return AArch64::STPDpost;
546
case AArch64::STPQi:
547
return AArch64::STPQpost;
548
case AArch64::STPWi:
549
return AArch64::STPWpost;
550
case AArch64::STPXi:
551
return AArch64::STPXpost;
552
case AArch64::STGi:
553
return AArch64::STGPostIndex;
554
case AArch64::STZGi:
555
return AArch64::STZGPostIndex;
556
case AArch64::ST2Gi:
557
return AArch64::ST2GPostIndex;
558
case AArch64::STZ2Gi:
559
return AArch64::STZ2GPostIndex;
560
case AArch64::STGPi:
561
return AArch64::STGPpost;
562
}
563
}
564
565
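// Returns true if FirstMI is a pre-indexed load/store and MI is its matching
// unindexed (scaled or unscaled) counterpart, so the two are candidates for
// forming a single pre-indexed pair.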
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
566
567
unsigned OpcA = FirstMI.getOpcode();
568
unsigned OpcB = MI.getOpcode();
569
570
switch (OpcA) {
571
default:
572
return false;
573
case AArch64::STRSpre:
574
return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
575
case AArch64::STRDpre:
576
return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
577
case AArch64::STRQpre:
578
return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
579
case AArch64::STRWpre:
580
return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
581
case AArch64::STRXpre:
582
return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
583
case AArch64::LDRSpre:
584
return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
585
case AArch64::LDRDpre:
586
return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
587
case AArch64::LDRQpre:
588
return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
589
case AArch64::LDRWpre:
590
return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
591
case AArch64::LDRXpre:
592
return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
593
case AArch64::LDRSWpre:
594
return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
595
}
596
}
597
598
// Returns the scale and offset range of pre/post indexed variants of MI.
599
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
600
int &MinOffset, int &MaxOffset) {
601
bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
602
bool IsTagStore = isTagStore(MI);
603
// ST*G and all paired ldst have the same scale in pre/post-indexed variants
604
// as in the "unsigned offset" variant.
605
// All other pre/post indexed ldst instructions are unscaled.
606
Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
607
608
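// Paired instructions use a 7-bit signed, scaled immediate (-64..63 elements);
// all other pre/post-indexed instructions use a 9-bit signed, unscaled
// immediate (-256..255 bytes).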
if (IsPaired) {
609
MinOffset = -64;
610
MaxOffset = 63;
611
} else {
612
MinOffset = -256;
613
MaxOffset = 255;
614
}
615
}
616
617
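// Returns the data register operand of a load/store. For pre-indexed
// instructions, operand 0 is the written-back base register, so the data
// register operands are shifted down by one.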
static MachineOperand &getLdStRegOp(MachineInstr &MI,
618
unsigned PairedRegOp = 0) {
619
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
620
bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
621
if (IsPreLdSt)
622
PairedRegOp += 1;
623
unsigned Idx =
624
AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
625
return MI.getOperand(Idx);
626
}
627
628
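// Returns true if the bytes read by LoadInst lie entirely within the bytes
// written by StoreInst, comparing both offsets in unscaled (byte) units.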
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
629
MachineInstr &StoreInst,
630
const AArch64InstrInfo *TII) {
631
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
632
int LoadSize = TII->getMemScale(LoadInst);
633
int StoreSize = TII->getMemScale(StoreInst);
634
int UnscaledStOffset =
635
TII->hasUnscaledLdStOffset(StoreInst)
636
? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
637
: AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
638
int UnscaledLdOffset =
639
TII->hasUnscaledLdStOffset(LoadInst)
640
? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
641
: AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
642
return (UnscaledStOffset <= UnscaledLdOffset) &&
643
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
644
}
645
646
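// Returns true for a 32-bit or narrower store of WZR, i.e. a zero store that
// can be merged with an adjacent zero store into a single wider store.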
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
647
unsigned Opc = MI.getOpcode();
648
return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
649
isNarrowStore(Opc)) &&
650
getLdStRegOp(MI).getReg() == AArch64::WZR;
651
}
652
653
static bool isPromotableLoadFromStore(MachineInstr &MI) {
654
switch (MI.getOpcode()) {
655
default:
656
return false;
657
// Scaled instructions.
658
case AArch64::LDRBBui:
659
case AArch64::LDRHHui:
660
case AArch64::LDRWui:
661
case AArch64::LDRXui:
662
// Unscaled instructions.
663
case AArch64::LDURBBi:
664
case AArch64::LDURHHi:
665
case AArch64::LDURWi:
666
case AArch64::LDURXi:
667
return true;
668
}
669
}
670
671
static bool isMergeableLdStUpdate(MachineInstr &MI) {
672
unsigned Opc = MI.getOpcode();
673
switch (Opc) {
674
default:
675
return false;
676
// Scaled instructions.
677
case AArch64::STRSui:
678
case AArch64::STRDui:
679
case AArch64::STRQui:
680
case AArch64::STRXui:
681
case AArch64::STRWui:
682
case AArch64::STRHHui:
683
case AArch64::STRBBui:
684
case AArch64::LDRSui:
685
case AArch64::LDRDui:
686
case AArch64::LDRQui:
687
case AArch64::LDRXui:
688
case AArch64::LDRWui:
689
case AArch64::LDRHHui:
690
case AArch64::LDRBBui:
691
case AArch64::STGi:
692
case AArch64::STZGi:
693
case AArch64::ST2Gi:
694
case AArch64::STZ2Gi:
695
case AArch64::STGPi:
696
// Unscaled instructions.
697
case AArch64::STURSi:
698
case AArch64::STURDi:
699
case AArch64::STURQi:
700
case AArch64::STURWi:
701
case AArch64::STURXi:
702
case AArch64::LDURSi:
703
case AArch64::LDURDi:
704
case AArch64::LDURQi:
705
case AArch64::LDURWi:
706
case AArch64::LDURXi:
707
// Paired instructions.
708
case AArch64::LDPSi:
709
case AArch64::LDPSWi:
710
case AArch64::LDPDi:
711
case AArch64::LDPQi:
712
case AArch64::LDPWi:
713
case AArch64::LDPXi:
714
case AArch64::STPSi:
715
case AArch64::STPDi:
716
case AArch64::STPQi:
717
case AArch64::STPWi:
718
case AArch64::STPXi:
719
// Make sure this is a reg+imm (as opposed to an address reloc).
720
if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
721
return false;
722
723
return true;
724
}
725
}
726
727
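// Opcodes whose implicit-def is known to correspond to the explicit result
// register (e.g. the X super-register implicitly defined by a 32-bit ORR/ADD),
// so the implicit-def can safely be rewritten when renaming.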
static bool isRewritableImplicitDef(unsigned Opc) {
728
switch (Opc) {
729
default:
730
return false;
731
case AArch64::ORRWrs:
732
case AArch64::ADDWri:
733
return true;
734
}
735
}
736
737
MachineBasicBlock::iterator
738
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
739
MachineBasicBlock::iterator MergeMI,
740
const LdStPairFlags &Flags) {
741
assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
742
"Expected promotable zero stores.");
743
744
MachineBasicBlock::iterator E = I->getParent()->end();
745
MachineBasicBlock::iterator NextI = next_nodbg(I, E);
746
// If NextI is the second of the two instructions to be merged, we need
747
// to skip one further. Either way, the merge will invalidate the iterator,
748
// and we don't need to scan the new instruction, as it's a pairwise
749
// instruction, which we're not considering for further action anyway.
750
if (NextI == MergeMI)
751
NextI = next_nodbg(NextI, E);
752
753
unsigned Opc = I->getOpcode();
754
unsigned MergeMIOpc = MergeMI->getOpcode();
755
bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
756
bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
757
int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
758
int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;
759
760
bool MergeForward = Flags.getMergeForward();
761
// Insert our new paired instruction after whichever of the paired
762
// instructions MergeForward indicates.
763
MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
764
// MergeForward also determines from which instruction we copy the base register
// operand, so that we get flags compatible with the input code.
766
const MachineOperand &BaseRegOp =
767
MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
768
: AArch64InstrInfo::getLdStBaseOp(*I);
769
770
// Which register is Rt and which is Rt2 depends on the offset order.
771
int64_t IOffsetInBytes =
772
AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
773
int64_t MIOffsetInBytes =
774
AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
775
MergeMIOffsetStride;
776
// Select final offset based on the offset order.
777
int64_t OffsetImm;
778
if (IOffsetInBytes > MIOffsetInBytes)
779
OffsetImm = MIOffsetInBytes;
780
else
781
OffsetImm = IOffsetInBytes;
782
783
int NewOpcode = getMatchingWideOpcode(Opc);
784
bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);
785
786
// Adjust final offset if the result opcode is a scaled store.
787
if (FinalIsScaled) {
788
int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;
789
assert(((OffsetImm % NewOffsetStride) == 0) &&
790
"Offset should be a multiple of the store memory scale");
791
OffsetImm = OffsetImm / NewOffsetStride;
792
}
793
794
// Construct the new instruction.
795
DebugLoc DL = I->getDebugLoc();
796
MachineBasicBlock *MBB = I->getParent();
797
MachineInstrBuilder MIB;
798
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
799
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
800
.add(BaseRegOp)
801
.addImm(OffsetImm)
802
.cloneMergedMemRefs({&*I, &*MergeMI})
803
.setMIFlags(I->mergeFlagsWith(*MergeMI));
804
(void)MIB;
805
806
LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
807
LLVM_DEBUG(I->print(dbgs()));
808
LLVM_DEBUG(dbgs() << " ");
809
LLVM_DEBUG(MergeMI->print(dbgs()));
810
LLVM_DEBUG(dbgs() << " with instruction:\n ");
811
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
812
LLVM_DEBUG(dbgs() << "\n");
813
814
// Erase the old instructions.
815
I->eraseFromParent();
816
MergeMI->eraseFromParent();
817
return NextI;
818
}
819
820
// Apply Fn to all instructions between MI and the beginning of the block, until
821
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
823
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
824
const TargetRegisterInfo *TRI, unsigned Limit,
825
std::function<bool(MachineInstr &, bool)> &Fn) {
826
auto MBB = MI.getParent();
827
for (MachineInstr &I :
828
instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
829
if (!Limit)
830
return false;
831
--Limit;
832
833
bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
834
return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
835
TRI->regsOverlap(MOP.getReg(), DefReg);
836
});
837
if (!Fn(I, isDef))
838
return false;
839
if (isDef)
840
break;
841
}
842
return true;
843
}
844
845
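// Update Units to reflect MI: first drop the units of all registers MI kills,
// then add the units of all remaining register operands (defs and uses that
// are not killed).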
static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
846
const TargetRegisterInfo *TRI) {
847
848
for (const MachineOperand &MOP : phys_regs_and_masks(MI))
849
if (MOP.isReg() && MOP.isKill())
850
Units.removeReg(MOP.getReg());
851
852
for (const MachineOperand &MOP : phys_regs_and_masks(MI))
853
if (MOP.isReg() && !MOP.isKill())
854
Units.addReg(MOP.getReg());
855
}
856
857
MachineBasicBlock::iterator
858
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
859
MachineBasicBlock::iterator Paired,
860
const LdStPairFlags &Flags) {
861
MachineBasicBlock::iterator E = I->getParent()->end();
862
MachineBasicBlock::iterator NextI = next_nodbg(I, E);
863
// If NextI is the second of the two instructions to be merged, we need
864
// to skip one further. Either way, the merge will invalidate the iterator,
865
// and we don't need to scan the new instruction, as it's a pairwise
866
// instruction, which we're not considering for further action anyway.
867
if (NextI == Paired)
868
NextI = next_nodbg(NextI, E);
869
870
int SExtIdx = Flags.getSExtIdx();
871
unsigned Opc =
872
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
873
bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
874
int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
875
876
bool MergeForward = Flags.getMergeForward();
877
878
std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
879
if (RenameReg) {
880
MCRegister RegToRename = getLdStRegOp(*I).getReg();
881
DefinedInBB.addReg(*RenameReg);
882
883
// Return the sub/super register for RenameReg, matching the size of
884
// OriginalReg.
885
auto GetMatchingSubReg =
886
[this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
887
for (MCPhysReg SubOrSuper :
888
TRI->sub_and_superregs_inclusive(*RenameReg)) {
889
if (C->contains(SubOrSuper))
890
return SubOrSuper;
891
}
892
llvm_unreachable("Should have found matching sub or super register!");
893
};
894
895
std::function<bool(MachineInstr &, bool)> UpdateMIs =
896
[this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
897
bool IsDef) {
898
if (IsDef) {
899
bool SeenDef = false;
900
for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
901
MachineOperand &MOP = MI.getOperand(OpIdx);
902
// Rename the first explicit definition and all implicit
903
// definitions matching RegToRename.
904
if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
905
(!MergeForward || !SeenDef ||
906
(MOP.isDef() && MOP.isImplicit())) &&
907
TRI->regsOverlap(MOP.getReg(), RegToRename)) {
908
assert((MOP.isImplicit() ||
909
(MOP.isRenamable() && !MOP.isEarlyClobber())) &&
910
"Need renamable operands");
911
Register MatchingReg;
912
if (const TargetRegisterClass *RC =
913
MI.getRegClassConstraint(OpIdx, TII, TRI))
914
MatchingReg = GetMatchingSubReg(RC);
915
else {
916
if (!isRewritableImplicitDef(MI.getOpcode()))
917
continue;
918
MatchingReg = GetMatchingSubReg(
919
TRI->getMinimalPhysRegClass(MOP.getReg()));
920
}
921
MOP.setReg(MatchingReg);
922
SeenDef = true;
923
}
924
}
925
} else {
926
for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
927
MachineOperand &MOP = MI.getOperand(OpIdx);
928
if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
929
TRI->regsOverlap(MOP.getReg(), RegToRename)) {
930
assert((MOP.isImplicit() ||
931
(MOP.isRenamable() && !MOP.isEarlyClobber())) &&
932
"Need renamable operands");
933
Register MatchingReg;
934
if (const TargetRegisterClass *RC =
935
MI.getRegClassConstraint(OpIdx, TII, TRI))
936
MatchingReg = GetMatchingSubReg(RC);
937
else
938
MatchingReg = GetMatchingSubReg(
939
TRI->getMinimalPhysRegClass(MOP.getReg()));
940
assert(MatchingReg != AArch64::NoRegister &&
941
"Cannot find matching regs for renaming");
942
MOP.setReg(MatchingReg);
943
}
944
}
945
}
946
LLVM_DEBUG(dbgs() << "Renamed " << MI);
947
return true;
948
};
949
forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI,
950
UINT32_MAX, UpdateMIs);
951
952
#if !defined(NDEBUG)
953
// For forward merging store:
954
// Make sure the register used for renaming is not used between the
955
// paired instructions. That would trash the content before the new
956
// paired instruction.
957
MCPhysReg RegToCheck = *RenameReg;
958
// For backward merging load:
959
// Make sure the register being renamed is not used between the
960
// paired instructions. That would trash the content after the new
961
// paired instruction.
962
if (!MergeForward)
963
RegToCheck = RegToRename;
964
for (auto &MI :
965
iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
966
MergeForward ? std::next(I) : I,
967
MergeForward ? std::next(Paired) : Paired))
968
assert(all_of(MI.operands(),
969
[this, RegToCheck](const MachineOperand &MOP) {
970
return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
971
MOP.isUndef() ||
972
!TRI->regsOverlap(MOP.getReg(), RegToCheck);
973
}) &&
974
"Rename register used between paired instruction, trashing the "
975
"content");
976
#endif
977
}
978
979
// Insert our new paired instruction after whichever of the paired
980
// instructions MergeForward indicates.
981
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
982
// MergeForward also determines from which instruction we copy the base register
// operand, so that we get flags compatible with the input code.
984
const MachineOperand &BaseRegOp =
985
MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
986
: AArch64InstrInfo::getLdStBaseOp(*I);
987
988
int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
989
int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
990
bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
991
if (IsUnscaled != PairedIsUnscaled) {
992
// We're trying to pair instructions that differ in how they are scaled. If
993
// I is scaled then scale the offset of Paired accordingly. Otherwise, do
994
// the opposite (i.e., make Paired's offset unscaled).
995
int MemSize = TII->getMemScale(*Paired);
996
if (PairedIsUnscaled) {
997
// If the unscaled offset isn't a multiple of the MemSize, we can't
998
// pair the operations together.
999
assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1000
"Offset should be a multiple of the stride!");
1001
PairedOffset /= MemSize;
1002
} else {
1003
PairedOffset *= MemSize;
1004
}
1005
}
1006
1007
// Which register is Rt and which is Rt2 depends on the offset order.
1008
// However, for pre load/stores the Rt should be the one of the pre
1009
// load/store.
1010
MachineInstr *RtMI, *Rt2MI;
1011
if (Offset == PairedOffset + OffsetStride &&
1012
!AArch64InstrInfo::isPreLdSt(*I)) {
1013
RtMI = &*Paired;
1014
Rt2MI = &*I;
1015
// Here we swapped the assumption made for SExtIdx.
1016
// I.e., we turn ldp I, Paired into ldp Paired, I.
1017
// Update the index accordingly.
1018
if (SExtIdx != -1)
1019
SExtIdx = (SExtIdx + 1) % 2;
1020
} else {
1021
RtMI = &*I;
1022
Rt2MI = &*Paired;
1023
}
1024
int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
1025
// Scale the immediate offset, if necessary.
1026
if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
1027
assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1028
"Unscaled offset cannot be scaled.");
1029
OffsetImm /= TII->getMemScale(*RtMI);
1030
}
1031
1032
// Construct the new instruction.
1033
MachineInstrBuilder MIB;
1034
DebugLoc DL = I->getDebugLoc();
1035
MachineBasicBlock *MBB = I->getParent();
1036
MachineOperand RegOp0 = getLdStRegOp(*RtMI);
1037
MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
1038
MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1039
// Kill flags may become invalid when moving stores for pairing.
1040
if (RegOp0.isUse()) {
1041
if (!MergeForward) {
1042
// Clear kill flags on store if moving upwards. Example:
1043
// STRWui kill %w0, ...
1044
// USE %w1
1045
// STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1046
// We are about to move the store of w1, so its kill flag may become
1047
// invalid; not the case for w0.
1048
// Since w1 is used between the stores, the kill flag on w1 is cleared
1049
// after merging.
1050
// STPWi kill %w0, %w1, ...
1051
// USE %w1
1052
for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)
1053
if (It->readsRegister(PairedRegOp.getReg(), TRI))
1054
PairedRegOp.setIsKill(false);
1055
} else {
1056
// Clear kill flags of the first store's register. Example:
1057
// STRWui %w1, ...
1058
// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1059
// STRW %w0
1060
Register Reg = getLdStRegOp(*I).getReg();
1061
for (MachineInstr &MI : make_range(std::next(I), Paired))
1062
MI.clearRegisterKills(Reg, TRI);
1063
}
1064
}
1065
1066
unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1067
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
1068
1069
// Adds the pre-index operand for pre-indexed ld/st pairs.
1070
if (AArch64InstrInfo::isPreLdSt(*RtMI))
1071
MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1072
1073
MIB.add(RegOp0)
1074
.add(RegOp1)
1075
.add(BaseRegOp)
1076
.addImm(OffsetImm)
1077
.cloneMergedMemRefs({&*I, &*Paired})
1078
.setMIFlags(I->mergeFlagsWith(*Paired));
1079
1080
(void)MIB;
1081
1082
LLVM_DEBUG(
1083
dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1084
LLVM_DEBUG(I->print(dbgs()));
1085
LLVM_DEBUG(dbgs() << " ");
1086
LLVM_DEBUG(Paired->print(dbgs()));
1087
LLVM_DEBUG(dbgs() << " with instruction:\n ");
1088
if (SExtIdx != -1) {
1089
// Generate the sign extension for the proper result of the ldp.
1090
// I.e., with X1, that would be:
1091
// %w1 = KILL %w1, implicit-def %x1
1092
// %x1 = SBFMXri killed %x1, 0, 31
1093
MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1094
// Right now, DstMO has the extended register, since it comes from an
1095
// extended opcode.
1096
Register DstRegX = DstMO.getReg();
1097
// Get the W variant of that register.
1098
Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1099
// Update the result of LDP to use the W instead of the X variant.
1100
DstMO.setReg(DstRegW);
1101
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1102
LLVM_DEBUG(dbgs() << "\n");
1103
// Make the machine verifier happy by providing a definition for
1104
// the X register.
1105
// Insert this definition right after the generated LDP, i.e., before
1106
// InsertionPoint.
1107
MachineInstrBuilder MIBKill =
1108
BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1109
.addReg(DstRegW)
1110
.addReg(DstRegX, RegState::Define);
1111
MIBKill->getOperand(2).setImplicit();
1112
// Create the sign extension.
1113
MachineInstrBuilder MIBSXTW =
1114
BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1115
.addReg(DstRegX)
1116
.addImm(0)
1117
.addImm(31);
1118
(void)MIBSXTW;
1119
LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1120
LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1121
} else {
1122
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1123
}
1124
LLVM_DEBUG(dbgs() << "\n");
1125
1126
if (MergeForward)
1127
for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1128
if (MOP.isReg() && MOP.isKill())
1129
DefinedInBB.addReg(MOP.getReg());
1130
1131
// Erase the old instructions.
1132
I->eraseFromParent();
1133
Paired->eraseFromParent();
1134
1135
return NextI;
1136
}
1137
1138
MachineBasicBlock::iterator
1139
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1140
MachineBasicBlock::iterator StoreI) {
1141
MachineBasicBlock::iterator NextI =
1142
next_nodbg(LoadI, LoadI->getParent()->end());
1143
1144
int LoadSize = TII->getMemScale(*LoadI);
1145
int StoreSize = TII->getMemScale(*StoreI);
1146
Register LdRt = getLdStRegOp(*LoadI).getReg();
1147
const MachineOperand &StMO = getLdStRegOp(*StoreI);
1148
Register StRt = getLdStRegOp(*StoreI).getReg();
1149
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1150
1151
assert((IsStoreXReg ||
1152
TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1153
"Unexpected RegClass");
1154
1155
MachineInstr *BitExtMI;
1156
if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1157
// Remove the load if the destination register of the load is the same
// register as the stored value.
1159
if (StRt == LdRt && LoadSize == 8) {
1160
for (MachineInstr &MI : make_range(StoreI->getIterator(),
1161
LoadI->getIterator())) {
1162
if (MI.killsRegister(StRt, TRI)) {
1163
MI.clearRegisterKills(StRt, TRI);
1164
break;
1165
}
1166
}
1167
LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1168
LLVM_DEBUG(LoadI->print(dbgs()));
1169
LLVM_DEBUG(dbgs() << "\n");
1170
LoadI->eraseFromParent();
1171
return NextI;
1172
}
1173
// Replace the load with a mov if the load and store have the same size.
1174
BitExtMI =
1175
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1176
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1177
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1178
.add(StMO)
1179
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1180
.setMIFlags(LoadI->getFlags());
1181
} else {
1182
// FIXME: Currently we disable this transformation in big-endian targets as
1183
// performance and correctness are verified only in little-endian.
1184
if (!Subtarget->isLittleEndian())
1185
return NextI;
1186
bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1187
assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1188
"Unsupported ld/st match");
1189
assert(LoadSize <= StoreSize && "Invalid load size");
1190
int UnscaledLdOffset =
1191
IsUnscaled
1192
? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1193
: AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1194
int UnscaledStOffset =
1195
IsUnscaled
1196
? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1197
: AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1198
int Width = LoadSize * 8;
1199
Register DestReg =
1200
IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1201
LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1202
: LdRt;
1203
1204
assert((UnscaledLdOffset >= UnscaledStOffset &&
1205
(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1206
"Invalid offset");
1207
1208
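// The loaded value occupies bits [Immr, Immr + Width - 1] of the stored
// register; the AND/UBFM built below extracts exactly those bits.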
int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1209
int Imms = Immr + Width - 1;
1210
if (UnscaledLdOffset == UnscaledStOffset) {
1211
uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1212
| ((Immr) << 6) // immr
1213
| ((Imms) << 0) // imms
1214
;
1215
1216
BitExtMI =
1217
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1218
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1219
DestReg)
1220
.add(StMO)
1221
.addImm(AndMaskEncoded)
1222
.setMIFlags(LoadI->getFlags());
1223
} else {
1224
BitExtMI =
1225
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1226
TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1227
DestReg)
1228
.add(StMO)
1229
.addImm(Immr)
1230
.addImm(Imms)
1231
.setMIFlags(LoadI->getFlags());
1232
}
1233
}
1234
1235
// Clear kill flags between store and load.
1236
for (MachineInstr &MI : make_range(StoreI->getIterator(),
1237
BitExtMI->getIterator()))
1238
if (MI.killsRegister(StRt, TRI)) {
1239
MI.clearRegisterKills(StRt, TRI);
1240
break;
1241
}
1242
1243
LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1244
LLVM_DEBUG(StoreI->print(dbgs()));
1245
LLVM_DEBUG(dbgs() << " ");
1246
LLVM_DEBUG(LoadI->print(dbgs()));
1247
LLVM_DEBUG(dbgs() << " with instructions:\n ");
1248
LLVM_DEBUG(StoreI->print(dbgs()));
1249
LLVM_DEBUG(dbgs() << " ");
1250
LLVM_DEBUG((BitExtMI)->print(dbgs()));
1251
LLVM_DEBUG(dbgs() << "\n");
1252
1253
// Erase the old instructions.
1254
LoadI->eraseFromParent();
1255
return NextI;
1256
}
1257
1258
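// Returns true if the element offset fits in the 7-bit signed immediate field
// of a paired load/store instruction.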
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1259
// Convert the byte-offset used by unscaled instructions into an "element"
// offset used by the scaled pair load/store instructions.
1261
if (IsUnscaled) {
1262
// If the byte-offset isn't a multiple of the stride, there's no point
1263
// trying to match it.
1264
if (Offset % OffsetStride)
1265
return false;
1266
Offset /= OffsetStride;
1267
}
1268
return Offset <= 63 && Offset >= -64;
1269
}
1270
1271
// Do alignment, specialized to power of 2 and for signed ints,
1272
// avoiding having to do a C-style cast from uint64_t to int when
1273
// using alignTo from include/llvm/Support/MathExtras.h.
1274
// FIXME: Move this function to include/MathExtras.h?
1275
static int alignTo(int Num, int PowOf2) {
1276
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1277
}
1278
1279
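// Returns true if MIa may alias any of the memory instructions in MemInsns.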
static bool mayAlias(MachineInstr &MIa,
1280
SmallVectorImpl<MachineInstr *> &MemInsns,
1281
AliasAnalysis *AA) {
1282
for (MachineInstr *MIb : MemInsns) {
1283
if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
1284
LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1285
return true;
1286
}
1287
}
1288
1289
LLVM_DEBUG(dbgs() << "No aliases found\n");
1290
return false;
1291
}
1292
1293
bool AArch64LoadStoreOpt::findMatchingStore(
1294
MachineBasicBlock::iterator I, unsigned Limit,
1295
MachineBasicBlock::iterator &StoreI) {
1296
MachineBasicBlock::iterator B = I->getParent()->begin();
1297
MachineBasicBlock::iterator MBBI = I;
1298
MachineInstr &LoadMI = *I;
1299
Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1300
1301
// If the load is the first instruction in the block, there's obviously
1302
// not any matching store.
1303
if (MBBI == B)
1304
return false;
1305
1306
// Track which register units have been modified and used between the first
1307
// insn and the second insn.
1308
ModifiedRegUnits.clear();
1309
UsedRegUnits.clear();
1310
1311
unsigned Count = 0;
1312
do {
1313
MBBI = prev_nodbg(MBBI, B);
1314
MachineInstr &MI = *MBBI;
1315
1316
// Don't count transient instructions towards the search limit since there
1317
// may be different numbers of them if e.g. debug information is present.
1318
if (!MI.isTransient())
1319
++Count;
1320
1321
// If the load instruction reads directly from the address to which the
1322
// store instruction writes and the stored value is not modified, we can
1323
// promote the load. Since we do not handle stores with pre-/post-index,
1324
// it's unnecessary to check if BaseReg is modified by the store itself.
1325
// Also, we can't handle stores without an immediate offset operand,
// since the operand might be the address of a global variable.
1327
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1328
BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1329
AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1330
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1331
ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1332
StoreI = MBBI;
1333
return true;
1334
}
1335
1336
if (MI.isCall())
1337
return false;
1338
1339
// Update modified / uses register units.
1340
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1341
1342
// Otherwise, if the base register is modified, we have no match, so
1343
// return early.
1344
if (!ModifiedRegUnits.available(BaseReg))
1345
return false;
1346
1347
// If we encounter a store aliased with the load, return early.
1348
if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1349
return false;
1350
} while (MBBI != B && Count < Limit);
1351
return false;
1352
}
1353
1354
static bool needsWinCFI(const MachineFunction *MF) {
1355
return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1356
MF->getFunction().needsUnwindTableEntry();
1357
}
1358
1359
// Returns true if FirstMI and MI are candidates for merging or pairing.
1360
// Otherwise, returns false.
1361
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1362
LdStPairFlags &Flags,
1363
const AArch64InstrInfo *TII) {
1364
// If this is volatile or if pairing is suppressed, not a candidate.
1365
if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1366
return false;
1367
1368
// We should have already checked FirstMI for pair suppression and volatility.
1369
assert(!FirstMI.hasOrderedMemoryRef() &&
1370
!TII->isLdStPairSuppressed(FirstMI) &&
1371
"FirstMI shouldn't get here if either of these checks are true.");
1372
1373
if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
1374
MI.getFlag(MachineInstr::FrameDestroy)))
1375
return false;
1376
1377
unsigned OpcA = FirstMI.getOpcode();
1378
unsigned OpcB = MI.getOpcode();
1379
1380
// Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1381
if (OpcA == OpcB)
1382
return !AArch64InstrInfo::isPreLdSt(FirstMI);
1383
1384
// Two pre ld/st of different opcodes cannot be merged either.
1385
if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1386
return false;
1387
1388
// Try to match a sign-extended load/store with a zero-extended load/store.
1389
bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1390
unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1391
assert(IsValidLdStrOpc &&
1392
"Given Opc should be a Load or Store with an immediate");
1393
// OpcA will be the first instruction in the pair.
1394
if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1395
Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1396
return true;
1397
}
1398
1399
// If the second instruction isn't even a mergable/pairable load/store, bail
1400
// out.
1401
if (!PairIsValidLdStrOpc)
1402
return false;
1403
1404
// FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1405
// offsets.
1406
if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1407
return false;
1408
1409
// The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1410
// LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1411
// are candidate pairs that can be merged.
1412
if (isPreLdStPairCandidate(FirstMI, MI))
1413
return true;
1414
1415
// Try to match an unscaled load/store with a scaled load/store.
1416
return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1417
getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1418
1419
// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1420
}
1421
1422
static bool canRenameMOP(const MachineOperand &MOP,
1423
const TargetRegisterInfo *TRI) {
1424
if (MOP.isReg()) {
1425
auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1426
// Renaming registers with multiple disjunct sub-registers (e.g. the
1427
// result of a LD3) means that all sub-registers are renamed, potentially
1428
// impacting other instructions we did not check. Bail out.
1429
// Note that this relies on the structure of the AArch64 register file. In
1430
// particular, a subregister cannot be written without overwriting the
1431
// whole register.
1432
if (RegClass->HasDisjunctSubRegs) {
1433
LLVM_DEBUG(
1434
dbgs()
1435
<< " Cannot rename operands with multiple disjunct subregisters ("
1436
<< MOP << ")\n");
1437
return false;
1438
}
1439
1440
// We cannot rename arbitrary implicit-defs; the specific rule to rewrite
1441
// them must be known. For example, in ORRWrs the implicit-def
1442
// corresponds to the result register.
1443
if (MOP.isImplicit() && MOP.isDef()) {
1444
if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
1445
return false;
1446
return TRI->isSuperOrSubRegisterEq(
1447
MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
1448
}
1449
}
1450
return MOP.isImplicit() ||
1451
(MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1452
}
1453
1454
static bool
1455
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1456
SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1457
const TargetRegisterInfo *TRI) {
1458
if (!FirstMI.mayStore())
1459
return false;
1460
1461
// Check if we can find an unused register which we can use to rename
1462
// the register used by the first load/store.
1463
1464
auto RegToRename = getLdStRegOp(FirstMI).getReg();
1465
// For now, we only rename if the store operand gets killed at the store.
1466
if (!getLdStRegOp(FirstMI).isKill() &&
1467
!any_of(FirstMI.operands(),
1468
[TRI, RegToRename](const MachineOperand &MOP) {
1469
return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1470
MOP.isImplicit() && MOP.isKill() &&
1471
TRI->regsOverlap(RegToRename, MOP.getReg());
1472
})) {
1473
LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1474
return false;
1475
}
1476
1477
bool FoundDef = false;
1478
1479
// For each instruction between FirstMI and the previous def for RegToRename, we
// * check if we can rename RegToRename in this instruction
// * collect the registers used and required register classes for RegToRename.
1483
std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1484
bool IsDef) {
1485
LLVM_DEBUG(dbgs() << "Checking " << MI);
1486
// Currently we do not try to rename across frame-setup instructions.
1487
if (MI.getFlag(MachineInstr::FrameSetup)) {
1488
LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1489
<< "currently\n");
1490
return false;
1491
}
1492
1493
UsedInBetween.accumulate(MI);
1494
1495
// For a definition, check that we can rename the definition and exit the
1496
// loop.
1497
FoundDef = IsDef;
1498
1499
// For defs, check if we can rename the first def of RegToRename.
1500
if (FoundDef) {
1501
// For some pseudo instructions, we might not generate code in the end
1502
// (e.g. KILL) and we would end up without a correct def for the rename
1503
// register.
1504
// TODO: This might be overly conservative and we could handle those cases
1505
// in multiple ways:
1506
// 1. Insert an extra copy, to materialize the def.
1507
// 2. Skip pseudo-defs until we find an non-pseudo def.
1508
if (MI.isPseudo()) {
1509
LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1510
return false;
1511
}
1512
1513
for (auto &MOP : MI.operands()) {
1514
if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1515
!TRI->regsOverlap(MOP.getReg(), RegToRename))
1516
continue;
1517
if (!canRenameMOP(MOP, TRI)) {
1518
LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1519
return false;
1520
}
1521
RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1522
}
1523
return true;
1524
} else {
1525
for (auto &MOP : MI.operands()) {
1526
if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1527
!TRI->regsOverlap(MOP.getReg(), RegToRename))
1528
continue;
1529
1530
if (!canRenameMOP(MOP, TRI)) {
1531
LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1532
return false;
1533
}
1534
RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1535
}
1536
}
1537
return true;
1538
};
1539
1540
if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1541
return false;
1542
1543
if (!FoundDef) {
1544
LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1545
return false;
1546
}
1547
return true;
1548
}
1549
1550
// We want to merge the second load into the first by rewriting the usages of
1551
// the same reg between first (incl.) and second (excl.). We don't need to care
1552
// about any insns before FirstLoad or after SecondLoad.
1553
// 1. The second load writes a new value into the same reg.
//    - The renaming cannot impact later uses of the reg.
//    - The second load always trashes the value written by the first load,
//      which means the reg must be killed before the second load.
1557
// 2. The first load must be a def for the same reg so we don't need to look
1558
// into anything before it.
1559
static bool canRenameUntilSecondLoad(
1560
MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1561
LiveRegUnits &UsedInBetween,
1562
SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1563
const TargetRegisterInfo *TRI) {
1564
if (FirstLoad.isPseudo())
1565
return false;
1566
1567
UsedInBetween.accumulate(FirstLoad);
1568
auto RegToRename = getLdStRegOp(FirstLoad).getReg();
1569
bool Success = std::all_of(
1570
FirstLoad.getIterator(), SecondLoad.getIterator(),
1571
[&](MachineInstr &MI) {
1572
LLVM_DEBUG(dbgs() << "Checking " << MI);
1573
// Currently we do not try to rename across frame-setup instructions.
1574
if (MI.getFlag(MachineInstr::FrameSetup)) {
1575
LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1576
<< "currently\n");
1577
return false;
1578
}
1579
1580
for (auto &MOP : MI.operands()) {
1581
if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1582
!TRI->regsOverlap(MOP.getReg(), RegToRename))
1583
continue;
1584
if (!canRenameMOP(MOP, TRI)) {
1585
LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1586
return false;
1587
}
1588
RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1589
}
1590
1591
return true;
1592
});
1593
return Success;
1594
}
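
// Hypothetical illustration (register choice is not from this file): two loads
// into the same register become pairable once the first definition and its
// uses up to the second load are renamed:
//   ldr w8, [x0]            ldr w9, [x0]
//   add w1, w1, w8    ==>   add w1, w1, w9
//   ldr w8, [x0, #4]        ldr w8, [x0, #4]
// The two loads can then be combined into "ldp w9, w8, [x0]".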

// Check if we can find a physical register for renaming \p Reg. This register
// must:
// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
//   defined registers up to the point where the renamed register will be used,
// * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range in which the rename register will be used,
// * be available in all used register classes (checked using RequiredClasses).
static std::optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(
          TRI->sub_and_superregs_inclusive(PR),
          [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return std::nullopt;
}
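
// Note on the callee-saved rejection above: picking a callee-saved register as
// the rename register would require a save/restore for it, and this pass runs
// after prologue/epilogue insertion, so no such spill could be added; the
// check is therefore deliberately conservative.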

// For store pairs: returns a register that can be renamed over the range from
// FirstMI back to the beginning of the block.
// For load pairs: returns a register that can be renamed over the range from
// FirstMI to MI.
static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
    std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
    Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  std::optional<MCPhysReg> RenameReg;
  if (!DebugCounter::shouldExecute(RegRenamingCounter))
    return RenameReg;

  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return RenameReg;

  const bool IsLoad = FirstMI.mayLoad();

  if (!MaybeCanRename) {
    if (IsLoad)
      MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
                                                 RequiredClasses, TRI)};
    else
      MaybeCanRename = {
          canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
  }

  if (*MaybeCanRename) {
    RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
                                          RequiredClasses, TRI);
  }
  return RenameReg;
}

/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can't be combined into a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction are
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1, [x2]
        //   ldr x2, [x3]
        //   ldr x4, [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2].
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                Reg, getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(&MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store, stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
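
// A rough sketch of the two outcomes decided above (the example code is
// illustrative, not taken from this file): with MergeForward == false the
// pair is formed at FirstMI's position,
//   ldr x0, [x2]               ldp x0, x1, [x2]
//   ...                  ==>   ...
//   ldr x1, [x2, #8]
// whereas with MergeForward == true the pair is formed at MI's position,
// which is the fallback when the second access cannot safely be hoisted up to
// the first.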

static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  auto End = MI.getParent()->end();
  if (MaybeCFI == End ||
      MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
      !(MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy)) ||
      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
    return End;

  const MachineFunction &MF = *MI.getParent()->getParent();
  unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
  const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
  switch (CFI.getOperation()) {
  case MCCFIInstruction::OpDefCfa:
  case MCCFIInstruction::OpDefCfaOffset:
    return MaybeCFI;
  default:
    return End;
  }
}
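
// Sketch of the reordering this enables (illustrative, assuming a prologue
// that pre-decrements SP):
//   sub sp, sp, #16              str x19, [sp, #-16]!
//   .cfi_def_cfa_offset 16  ==>  .cfi_def_cfa_offset 16
//   str x19, [sp]
// The CFI directive that described the SP update is spliced after the merged
// pre-indexed store by mergeUpdateInsn below.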

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);

  // If we are updating the SP and the following instruction is a
  // CFA-offset-related CFI instruction, move it after the merged instruction.
  MachineBasicBlock::iterator CFI =
      IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  if (CFI != E) {
    MachineBasicBlock *MBB = I->getParent();
    MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
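
// For instance (illustrative values): with "ldr x0, [x1]" as MemMI and
// Offset == 0, "add x1, x1, #8" matches and can later be folded into
// "ldr x0, [x1], #8", whereas "add x1, x1, #8, lsl #12" or "add x2, x1, #8"
// would be rejected by the checks above.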

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
                         TII->getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // Neither does it apply to STGPi, which, unlike normal stores, does not have
  // unpredictable behavior in this case and always performs writeback after
  // reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires adjusting the unwind/debug info; messing up the
    // unwind info can actually cause a miscompile.
    return E;
  }

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    // If we are optimizing SP, do not allow instructions that may load or store
    // in between the load and the optimized value update.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires adjusting the unwind/debug info; messing up the
    // unwind info can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
    // Keep track of whether we have a memory access before an SP pre-increment;
    // in this case we need to validate later that the update amount respects
    // the red zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}
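
// Illustrative scenario for the red-zone check above (registers and sizes are
// hypothetical): folding "sub sp, sp, #32" into a later "str x21, [sp]" to
// form "str x21, [sp, #-32]!" delays the SP decrement past any intervening
// memory access, so anything that access wrote to the newly allocated area
// would briefly sit below SP; that is only tolerable when the update amount
// fits in the function's red zone.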

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If the disable-ldp feature is enabled, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If the disable-stp feature is enabled, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // If the ldp-aligned-only/stp-aligned-only feature is enabled, check that
    // the alignment of the source pointer is at least double the alignment of
    // the type.
    if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
        (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
      // If there is no size/align information, cancel the transformation.
      if (!MemOp || !MemOp->getMemoryType().isValid()) {
        NumFailedAlignmentCheck++;
        return false;
      }

      // Get the alignments needed for the ldp-aligned-only/stp-aligned-only
      // checks.
      uint64_t MemAlignment = MemOp->getAlign().value();
      uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();

      if (MemAlignment < 2 * TypeAlignment) {
        NumFailedAlignmentCheck++;
        return false;
      }
    }

    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;

    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);

    return true;
  }
  return false;
}
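
// Illustrative reading of the aligned-only check (values hypothetical): with
// the ldp-aligned-only feature, two adjacent 8-byte loads are only paired when
// the memory operand is known to be at least 16-byte aligned; a pointer known
// to be only 8-byte aligned keeps the loads separate.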

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  //   ldr x0, [x20]
  //   add x20, x20, #32
  // merged into:
  //   ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  //   add x0, x0, #8
  //   ldr x1, [x0]
  // merged into:
  //   ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset =
      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  //   ldr x1, [x0, #64]
  //   add x0, x0, #64
  // merged into:
  //   ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {

  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]

  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, a pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}