GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file contains a pass that performs load / store related peephole
10
// optimizations. This pass should be run after register allocation.
11
//
12
// The pass runs after the PrologEpilogInserter where we emit the CFI
13
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the relative order of any two instructions if one
// of them has the FrameSetup/FrameDestroy flag or, alternatively, it must apply
// an ad-hoc fix to the unwind information.
17
//
18
//===----------------------------------------------------------------------===//
19
20
#include "AArch64InstrInfo.h"
21
#include "AArch64MachineFunctionInfo.h"
22
#include "AArch64Subtarget.h"
23
#include "MCTargetDesc/AArch64AddressingModes.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/Statistic.h"
26
#include "llvm/ADT/StringRef.h"
27
#include "llvm/ADT/iterator_range.h"
28
#include "llvm/Analysis/AliasAnalysis.h"
29
#include "llvm/CodeGen/MachineBasicBlock.h"
30
#include "llvm/CodeGen/MachineFunction.h"
31
#include "llvm/CodeGen/MachineFunctionPass.h"
32
#include "llvm/CodeGen/MachineInstr.h"
33
#include "llvm/CodeGen/MachineInstrBuilder.h"
34
#include "llvm/CodeGen/MachineOperand.h"
35
#include "llvm/CodeGen/MachineRegisterInfo.h"
36
#include "llvm/CodeGen/TargetRegisterInfo.h"
37
#include "llvm/IR/DebugLoc.h"
38
#include "llvm/MC/MCAsmInfo.h"
39
#include "llvm/MC/MCDwarf.h"
40
#include "llvm/MC/MCRegisterInfo.h"
41
#include "llvm/Pass.h"
42
#include "llvm/Support/CommandLine.h"
43
#include "llvm/Support/Debug.h"
44
#include "llvm/Support/DebugCounter.h"
45
#include "llvm/Support/ErrorHandling.h"
46
#include "llvm/Support/raw_ostream.h"
47
#include <cassert>
48
#include <cstdint>
49
#include <functional>
50
#include <iterator>
51
#include <limits>
52
#include <optional>
53
54
using namespace llvm;
55
56
#define DEBUG_TYPE "aarch64-ldst-opt"
57
58
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
59
STATISTIC(NumPostFolded, "Number of post-index updates folded");
60
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
61
STATISTIC(NumUnscaledPairCreated,
62
"Number of load/store from unscaled generated");
63
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
64
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65
STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformations "
                                   "that failed the alignment check");
67
68
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
69
"Controls which pairs are considered for renaming");
70
71
// The LdStLimit limits how far we search for load/store pairs.
72
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
73
cl::init(20), cl::Hidden);
74
75
// The UpdateLimit limits how far we search for update instructions when we form
76
// pre-/post-index instructions.
77
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
78
cl::Hidden);
79
80
// Enable register renaming to find additional store pairing opportunities.
81
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
82
cl::init(true), cl::Hidden);
83
84
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
85
86
namespace {
87
88
using LdStPairFlags = struct LdStPairFlags {
89
// If a matching instruction is found, MergeForward is set to true if the
90
// merge is to remove the first instruction and replace the second with
91
// a pair-wise insn, and false if the reverse is true.
92
bool MergeForward = false;
93
94
// SExtIdx gives the index of the result of the load pair that must be
95
// extended. The value of SExtIdx assumes that the paired load produces the
96
// value in this order: (I, returned iterator), i.e., -1 means no value has
97
// to be extended, 0 means I, and 1 means the returned iterator.
98
int SExtIdx = -1;
99
100
// If not none, RenameReg can be used to rename the result register of the
101
// first store in a pair. Currently this only works when merging stores
102
// forward.
103
std::optional<MCPhysReg> RenameReg;
104
105
LdStPairFlags() = default;
106
107
void setMergeForward(bool V = true) { MergeForward = V; }
108
bool getMergeForward() const { return MergeForward; }
109
110
void setSExtIdx(int V) { SExtIdx = V; }
111
int getSExtIdx() const { return SExtIdx; }
112
113
void setRenameReg(MCPhysReg R) { RenameReg = R; }
114
void clearRenameReg() { RenameReg = std::nullopt; }
115
std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
116
};
117
118
struct AArch64LoadStoreOpt : public MachineFunctionPass {
119
static char ID;
120
121
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
122
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
123
}
124
125
AliasAnalysis *AA;
126
const AArch64InstrInfo *TII;
127
const TargetRegisterInfo *TRI;
128
const AArch64Subtarget *Subtarget;
129
130
// Track which register units have been modified and used.
131
LiveRegUnits ModifiedRegUnits, UsedRegUnits;
132
LiveRegUnits DefinedInBB;
133
134
void getAnalysisUsage(AnalysisUsage &AU) const override {
135
AU.addRequired<AAResultsWrapperPass>();
136
MachineFunctionPass::getAnalysisUsage(AU);
137
}
138
139
// Scan the instructions looking for a load/store that can be combined
140
// with the current instruction into a load/store pair.
141
// Return the matching instruction if one is found, else MBB->end().
142
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
143
LdStPairFlags &Flags,
144
unsigned Limit,
145
bool FindNarrowMerge);
146
147
// Scan the instructions looking for a store that writes to the address from
148
// which the current load instruction reads. Return true if one is found.
149
bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
150
MachineBasicBlock::iterator &StoreI);
151
152
// Merge the two narrow store instructions indicated into a single wider store.
153
MachineBasicBlock::iterator
154
mergeNarrowZeroStores(MachineBasicBlock::iterator I,
155
MachineBasicBlock::iterator MergeMI,
156
const LdStPairFlags &Flags);
157
158
// Merge the two instructions indicated into a single pair-wise instruction.
159
MachineBasicBlock::iterator
160
mergePairedInsns(MachineBasicBlock::iterator I,
161
MachineBasicBlock::iterator Paired,
162
const LdStPairFlags &Flags);
163
164
// Promote the load that reads directly from the address stored to.
165
MachineBasicBlock::iterator
166
promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
167
MachineBasicBlock::iterator StoreI);
168
169
// Scan the instruction list to find a base register update that can
170
// be combined with the current instruction (a load or store) using
171
// pre or post indexed addressing with writeback. Scan forwards.
172
MachineBasicBlock::iterator
173
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
174
int UnscaledOffset, unsigned Limit);
175
176
// Scan the instruction list to find a base register update that can
177
// be combined with the current instruction (a load or store) using
178
// pre or post indexed addressing with writeback. Scan backwards.
179
MachineBasicBlock::iterator
180
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
181
182
// Find an instruction that updates the base register of the ld/st
183
// instruction.
184
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
185
unsigned BaseReg, int Offset);
186
187
// Merge a pre- or post-index base register update into a ld/st instruction.
188
MachineBasicBlock::iterator
189
mergeUpdateInsn(MachineBasicBlock::iterator I,
190
MachineBasicBlock::iterator Update, bool IsPreIdx);
191
192
// Find and merge zero store instructions.
193
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
194
195
// Find and pair ldr/str instructions.
196
bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
197
198
// Find and promote load instructions which read directly from store.
199
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
200
201
// Find and merge base register updates before or after a ld/st instruction.
202
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
203
204
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
205
206
bool runOnMachineFunction(MachineFunction &Fn) override;
207
208
MachineFunctionProperties getRequiredProperties() const override {
209
return MachineFunctionProperties().set(
210
MachineFunctionProperties::Property::NoVRegs);
211
}
212
213
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
214
};
215
216
char AArch64LoadStoreOpt::ID = 0;
217
218
} // end anonymous namespace
219
220
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
221
AARCH64_LOAD_STORE_OPT_NAME, false, false)
222
223
static bool isNarrowStore(unsigned Opc) {
224
switch (Opc) {
225
default:
226
return false;
227
case AArch64::STRBBui:
228
case AArch64::STURBBi:
229
case AArch64::STRHHui:
230
case AArch64::STURHHi:
231
return true;
232
}
233
}
234
235
// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source register.
237
static bool isTagStore(const MachineInstr &MI) {
238
switch (MI.getOpcode()) {
239
default:
240
return false;
241
case AArch64::STGi:
242
case AArch64::STZGi:
243
case AArch64::ST2Gi:
244
case AArch64::STZ2Gi:
245
return true;
246
}
247
}
248
249
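// Returns the non-sign-extending equivalent of a sign-extending load opcode
// (e.g. LDRSWui -> LDRWui) and Opc itself for other pairable load/store
// opcodes. Sets *IsValidLdStrOpc to false if Opc is not a candidate at all.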
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
250
bool *IsValidLdStrOpc = nullptr) {
251
if (IsValidLdStrOpc)
252
*IsValidLdStrOpc = true;
253
switch (Opc) {
254
default:
255
if (IsValidLdStrOpc)
256
*IsValidLdStrOpc = false;
257
return std::numeric_limits<unsigned>::max();
258
case AArch64::STRDui:
259
case AArch64::STURDi:
260
case AArch64::STRDpre:
261
case AArch64::STRQui:
262
case AArch64::STURQi:
263
case AArch64::STRQpre:
264
case AArch64::STRBBui:
265
case AArch64::STURBBi:
266
case AArch64::STRHHui:
267
case AArch64::STURHHi:
268
case AArch64::STRWui:
269
case AArch64::STRWpre:
270
case AArch64::STURWi:
271
case AArch64::STRXui:
272
case AArch64::STRXpre:
273
case AArch64::STURXi:
274
case AArch64::LDRDui:
275
case AArch64::LDURDi:
276
case AArch64::LDRDpre:
277
case AArch64::LDRQui:
278
case AArch64::LDURQi:
279
case AArch64::LDRQpre:
280
case AArch64::LDRWui:
281
case AArch64::LDURWi:
282
case AArch64::LDRWpre:
283
case AArch64::LDRXui:
284
case AArch64::LDURXi:
285
case AArch64::LDRXpre:
286
case AArch64::STRSui:
287
case AArch64::STURSi:
288
case AArch64::STRSpre:
289
case AArch64::LDRSui:
290
case AArch64::LDURSi:
291
case AArch64::LDRSpre:
292
return Opc;
293
case AArch64::LDRSWui:
294
return AArch64::LDRWui;
295
case AArch64::LDURSWi:
296
return AArch64::LDURWi;
297
case AArch64::LDRSWpre:
298
return AArch64::LDRWpre;
299
}
300
}
301
302
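// Returns the store opcode of twice the width of Opc; used when merging two
// adjacent zero stores into a single wider store.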
static unsigned getMatchingWideOpcode(unsigned Opc) {
303
switch (Opc) {
304
default:
305
llvm_unreachable("Opcode has no wide equivalent!");
306
case AArch64::STRBBui:
307
return AArch64::STRHHui;
308
case AArch64::STRHHui:
309
return AArch64::STRWui;
310
case AArch64::STURBBi:
311
return AArch64::STURHHi;
312
case AArch64::STURHHi:
313
return AArch64::STURWi;
314
case AArch64::STURWi:
315
return AArch64::STURXi;
316
case AArch64::STRWui:
317
return AArch64::STRXui;
318
}
319
}
320
321
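// Maps a single load/store opcode to the corresponding load/store-pair opcode.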
static unsigned getMatchingPairOpcode(unsigned Opc) {
322
switch (Opc) {
323
default:
324
llvm_unreachable("Opcode has no pairwise equivalent!");
325
case AArch64::STRSui:
326
case AArch64::STURSi:
327
return AArch64::STPSi;
328
case AArch64::STRSpre:
329
return AArch64::STPSpre;
330
case AArch64::STRDui:
331
case AArch64::STURDi:
332
return AArch64::STPDi;
333
case AArch64::STRDpre:
334
return AArch64::STPDpre;
335
case AArch64::STRQui:
336
case AArch64::STURQi:
337
return AArch64::STPQi;
338
case AArch64::STRQpre:
339
return AArch64::STPQpre;
340
case AArch64::STRWui:
341
case AArch64::STURWi:
342
return AArch64::STPWi;
343
case AArch64::STRWpre:
344
return AArch64::STPWpre;
345
case AArch64::STRXui:
346
case AArch64::STURXi:
347
return AArch64::STPXi;
348
case AArch64::STRXpre:
349
return AArch64::STPXpre;
350
case AArch64::LDRSui:
351
case AArch64::LDURSi:
352
return AArch64::LDPSi;
353
case AArch64::LDRSpre:
354
return AArch64::LDPSpre;
355
case AArch64::LDRDui:
356
case AArch64::LDURDi:
357
return AArch64::LDPDi;
358
case AArch64::LDRDpre:
359
return AArch64::LDPDpre;
360
case AArch64::LDRQui:
361
case AArch64::LDURQi:
362
return AArch64::LDPQi;
363
case AArch64::LDRQpre:
364
return AArch64::LDPQpre;
365
case AArch64::LDRWui:
366
case AArch64::LDURWi:
367
return AArch64::LDPWi;
368
case AArch64::LDRWpre:
369
return AArch64::LDPWpre;
370
case AArch64::LDRXui:
371
case AArch64::LDURXi:
372
return AArch64::LDPXi;
373
case AArch64::LDRXpre:
374
return AArch64::LDPXpre;
375
case AArch64::LDRSWui:
376
case AArch64::LDURSWi:
377
return AArch64::LDPSWi;
378
case AArch64::LDRSWpre:
379
return AArch64::LDPSWpre;
380
}
381
}
382
383
static unsigned isMatchingStore(MachineInstr &LoadInst,
384
MachineInstr &StoreInst) {
385
unsigned LdOpc = LoadInst.getOpcode();
386
unsigned StOpc = StoreInst.getOpcode();
387
switch (LdOpc) {
388
default:
389
llvm_unreachable("Unsupported load instruction!");
390
case AArch64::LDRBBui:
391
return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
392
StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
393
case AArch64::LDURBBi:
394
return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
395
StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
396
case AArch64::LDRHHui:
397
return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
398
StOpc == AArch64::STRXui;
399
case AArch64::LDURHHi:
400
return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
401
StOpc == AArch64::STURXi;
402
case AArch64::LDRWui:
403
return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
404
case AArch64::LDURWi:
405
return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
406
case AArch64::LDRXui:
407
return StOpc == AArch64::STRXui;
408
case AArch64::LDURXi:
409
return StOpc == AArch64::STURXi;
410
}
411
}
412
413
static unsigned getPreIndexedOpcode(unsigned Opc) {
414
// FIXME: We don't currently support creating pre-indexed loads/stores when
415
// the load or store is the unscaled version. If we decide to perform such an
416
// optimization in the future the cases for the unscaled loads/stores will
417
// need to be added here.
418
switch (Opc) {
419
default:
420
llvm_unreachable("Opcode has no pre-indexed equivalent!");
421
case AArch64::STRSui:
422
return AArch64::STRSpre;
423
case AArch64::STRDui:
424
return AArch64::STRDpre;
425
case AArch64::STRQui:
426
return AArch64::STRQpre;
427
case AArch64::STRBBui:
428
return AArch64::STRBBpre;
429
case AArch64::STRHHui:
430
return AArch64::STRHHpre;
431
case AArch64::STRWui:
432
return AArch64::STRWpre;
433
case AArch64::STRXui:
434
return AArch64::STRXpre;
435
case AArch64::LDRSui:
436
return AArch64::LDRSpre;
437
case AArch64::LDRDui:
438
return AArch64::LDRDpre;
439
case AArch64::LDRQui:
440
return AArch64::LDRQpre;
441
case AArch64::LDRBBui:
442
return AArch64::LDRBBpre;
443
case AArch64::LDRHHui:
444
return AArch64::LDRHHpre;
445
case AArch64::LDRWui:
446
return AArch64::LDRWpre;
447
case AArch64::LDRXui:
448
return AArch64::LDRXpre;
449
case AArch64::LDRSWui:
450
return AArch64::LDRSWpre;
451
case AArch64::LDPSi:
452
return AArch64::LDPSpre;
453
case AArch64::LDPSWi:
454
return AArch64::LDPSWpre;
455
case AArch64::LDPDi:
456
return AArch64::LDPDpre;
457
case AArch64::LDPQi:
458
return AArch64::LDPQpre;
459
case AArch64::LDPWi:
460
return AArch64::LDPWpre;
461
case AArch64::LDPXi:
462
return AArch64::LDPXpre;
463
case AArch64::STPSi:
464
return AArch64::STPSpre;
465
case AArch64::STPDi:
466
return AArch64::STPDpre;
467
case AArch64::STPQi:
468
return AArch64::STPQpre;
469
case AArch64::STPWi:
470
return AArch64::STPWpre;
471
case AArch64::STPXi:
472
return AArch64::STPXpre;
473
case AArch64::STGi:
474
return AArch64::STGPreIndex;
475
case AArch64::STZGi:
476
return AArch64::STZGPreIndex;
477
case AArch64::ST2Gi:
478
return AArch64::ST2GPreIndex;
479
case AArch64::STZ2Gi:
480
return AArch64::STZ2GPreIndex;
481
case AArch64::STGPi:
482
return AArch64::STGPpre;
483
}
484
}
485
486
static unsigned getPostIndexedOpcode(unsigned Opc) {
487
switch (Opc) {
488
default:
489
llvm_unreachable("Opcode has no post-indexed wise equivalent!");
490
case AArch64::STRSui:
491
case AArch64::STURSi:
492
return AArch64::STRSpost;
493
case AArch64::STRDui:
494
case AArch64::STURDi:
495
return AArch64::STRDpost;
496
case AArch64::STRQui:
497
case AArch64::STURQi:
498
return AArch64::STRQpost;
499
case AArch64::STRBBui:
500
return AArch64::STRBBpost;
501
case AArch64::STRHHui:
502
return AArch64::STRHHpost;
503
case AArch64::STRWui:
504
case AArch64::STURWi:
505
return AArch64::STRWpost;
506
case AArch64::STRXui:
507
case AArch64::STURXi:
508
return AArch64::STRXpost;
509
case AArch64::LDRSui:
510
case AArch64::LDURSi:
511
return AArch64::LDRSpost;
512
case AArch64::LDRDui:
513
case AArch64::LDURDi:
514
return AArch64::LDRDpost;
515
case AArch64::LDRQui:
516
case AArch64::LDURQi:
517
return AArch64::LDRQpost;
518
case AArch64::LDRBBui:
519
return AArch64::LDRBBpost;
520
case AArch64::LDRHHui:
521
return AArch64::LDRHHpost;
522
case AArch64::LDRWui:
523
case AArch64::LDURWi:
524
return AArch64::LDRWpost;
525
case AArch64::LDRXui:
526
case AArch64::LDURXi:
527
return AArch64::LDRXpost;
528
case AArch64::LDRSWui:
529
return AArch64::LDRSWpost;
530
case AArch64::LDPSi:
531
return AArch64::LDPSpost;
532
case AArch64::LDPSWi:
533
return AArch64::LDPSWpost;
534
case AArch64::LDPDi:
535
return AArch64::LDPDpost;
536
case AArch64::LDPQi:
537
return AArch64::LDPQpost;
538
case AArch64::LDPWi:
539
return AArch64::LDPWpost;
540
case AArch64::LDPXi:
541
return AArch64::LDPXpost;
542
case AArch64::STPSi:
543
return AArch64::STPSpost;
544
case AArch64::STPDi:
545
return AArch64::STPDpost;
546
case AArch64::STPQi:
547
return AArch64::STPQpost;
548
case AArch64::STPWi:
549
return AArch64::STPWpost;
550
case AArch64::STPXi:
551
return AArch64::STPXpost;
552
case AArch64::STGi:
553
return AArch64::STGPostIndex;
554
case AArch64::STZGi:
555
return AArch64::STZGPostIndex;
556
case AArch64::ST2Gi:
557
return AArch64::ST2GPostIndex;
558
case AArch64::STZ2Gi:
559
return AArch64::STZ2GPostIndex;
560
case AArch64::STGPi:
561
return AArch64::STGPpost;
562
}
563
}
564
565
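// Returns true if FirstMI is a pre-indexed load/store and MI is its matching
// unindexed (scaled or unscaled) counterpart, so the two are candidates for
// forming a single pre-indexed pair.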
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
566
567
unsigned OpcA = FirstMI.getOpcode();
568
unsigned OpcB = MI.getOpcode();
569
570
switch (OpcA) {
571
default:
572
return false;
573
case AArch64::STRSpre:
574
return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
575
case AArch64::STRDpre:
576
return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
577
case AArch64::STRQpre:
578
return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
579
case AArch64::STRWpre:
580
return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
581
case AArch64::STRXpre:
582
return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
583
case AArch64::LDRSpre:
584
return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
585
case AArch64::LDRDpre:
586
return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
587
case AArch64::LDRQpre:
588
return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
589
case AArch64::LDRWpre:
590
return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
591
case AArch64::LDRXpre:
592
return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
593
case AArch64::LDRSWpre:
594
return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
595
}
596
}
597
598
// Returns the scale and offset range of pre/post indexed variants of MI.
599
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
600
int &MinOffset, int &MaxOffset) {
601
bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
602
bool IsTagStore = isTagStore(MI);
603
// ST*G and all paired ldst have the same scale in pre/post-indexed variants
604
// as in the "unsigned offset" variant.
605
// All other pre/post indexed ldst instructions are unscaled.
606
Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
607
608
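// Paired instructions use a 7-bit signed, scaled immediate (-64..63 elements);
// all other pre/post-indexed instructions use a 9-bit signed, unscaled
// immediate (-256..255 bytes).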
if (IsPaired) {
609
MinOffset = -64;
610
MaxOffset = 63;
611
} else {
612
MinOffset = -256;
613
MaxOffset = 255;
614
}
615
}
616
617
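// Returns the data register operand of a load/store. For pre-indexed
// instructions, operand 0 is the written-back base register, so the data
// register operands are shifted down by one.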
static MachineOperand &getLdStRegOp(MachineInstr &MI,
618
unsigned PairedRegOp = 0) {
619
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
620
bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
621
if (IsPreLdSt)
622
PairedRegOp += 1;
623
unsigned Idx =
624
AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
625
return MI.getOperand(Idx);
626
}
627
628
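// Returns true if the bytes read by LoadInst lie entirely within the bytes
// written by StoreInst, comparing both offsets in unscaled (byte) units.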
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
629
MachineInstr &StoreInst,
630
const AArch64InstrInfo *TII) {
631
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
632
int LoadSize = TII->getMemScale(LoadInst);
633
int StoreSize = TII->getMemScale(StoreInst);
634
int UnscaledStOffset =
635
TII->hasUnscaledLdStOffset(StoreInst)
636
? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
637
: AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
638
int UnscaledLdOffset =
639
TII->hasUnscaledLdStOffset(LoadInst)
640
? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
641
: AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
642
return (UnscaledStOffset <= UnscaledLdOffset) &&
643
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
644
}
645
646
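// Returns true for a 32-bit or narrower store of WZR, i.e. a zero store that
// can be merged with an adjacent zero store into a single wider store.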
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
647
unsigned Opc = MI.getOpcode();
648
return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
649
isNarrowStore(Opc)) &&
650
getLdStRegOp(MI).getReg() == AArch64::WZR;
651
}
652
653
static bool isPromotableLoadFromStore(MachineInstr &MI) {
654
switch (MI.getOpcode()) {
655
default:
656
return false;
657
// Scaled instructions.
658
case AArch64::LDRBBui:
659
case AArch64::LDRHHui:
660
case AArch64::LDRWui:
661
case AArch64::LDRXui:
662
// Unscaled instructions.
663
case AArch64::LDURBBi:
664
case AArch64::LDURHHi:
665
case AArch64::LDURWi:
666
case AArch64::LDURXi:
667
return true;
668
}
669
}
670
671
static bool isMergeableLdStUpdate(MachineInstr &MI) {
672
unsigned Opc = MI.getOpcode();
673
switch (Opc) {
674
default:
675
return false;
676
// Scaled instructions.
677
case AArch64::STRSui:
678
case AArch64::STRDui:
679
case AArch64::STRQui:
680
case AArch64::STRXui:
681
case AArch64::STRWui:
682
case AArch64::STRHHui:
683
case AArch64::STRBBui:
684
case AArch64::LDRSui:
685
case AArch64::LDRDui:
686
case AArch64::LDRQui:
687
case AArch64::LDRXui:
688
case AArch64::LDRWui:
689
case AArch64::LDRHHui:
690
case AArch64::LDRBBui:
691
case AArch64::STGi:
692
case AArch64::STZGi:
693
case AArch64::ST2Gi:
694
case AArch64::STZ2Gi:
695
case AArch64::STGPi:
696
// Unscaled instructions.
697
case AArch64::STURSi:
698
case AArch64::STURDi:
699
case AArch64::STURQi:
700
case AArch64::STURWi:
701
case AArch64::STURXi:
702
case AArch64::LDURSi:
703
case AArch64::LDURDi:
704
case AArch64::LDURQi:
705
case AArch64::LDURWi:
706
case AArch64::LDURXi:
707
// Paired instructions.
708
case AArch64::LDPSi:
709
case AArch64::LDPSWi:
710
case AArch64::LDPDi:
711
case AArch64::LDPQi:
712
case AArch64::LDPWi:
713
case AArch64::LDPXi:
714
case AArch64::STPSi:
715
case AArch64::STPDi:
716
case AArch64::STPQi:
717
case AArch64::STPWi:
718
case AArch64::STPXi:
719
// Make sure this is a reg+imm (as opposed to an address reloc).
720
if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
721
return false;
722
723
return true;
724
}
725
}
726
727
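// Opcodes whose implicit-def is known to correspond to the explicit result
// register (e.g. the X super-register implicitly defined by a 32-bit ORR/ADD),
// so the implicit-def can safely be rewritten when renaming.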
static bool isRewritableImplicitDef(unsigned Opc) {
728
switch (Opc) {
729
default:
730
return false;
731
case AArch64::ORRWrs:
732
case AArch64::ADDWri:
733
return true;
734
}
735
}
736
737
MachineBasicBlock::iterator
738
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
739
MachineBasicBlock::iterator MergeMI,
740
const LdStPairFlags &Flags) {
741
assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
742
"Expected promotable zero stores.");
743
744
MachineBasicBlock::iterator E = I->getParent()->end();
745
MachineBasicBlock::iterator NextI = next_nodbg(I, E);
746
// If NextI is the second of the two instructions to be merged, we need
747
// to skip one further. Either way, the merge will invalidate the iterator,
748
// and we don't need to scan the new instruction, as it's a pairwise
749
// instruction, which we're not considering for further action anyway.
750
if (NextI == MergeMI)
751
NextI = next_nodbg(NextI, E);
752
753
unsigned Opc = I->getOpcode();
754
unsigned MergeMIOpc = MergeMI->getOpcode();
755
bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
756
bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
757
int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
758
int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;
759
760
bool MergeForward = Flags.getMergeForward();
761
// Insert our new paired instruction after whichever of the paired
762
// instructions MergeForward indicates.
763
MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
764
// MergeForward also determines from which instruction we copy the base register
// operand, so that we get flags compatible with the input code.
766
const MachineOperand &BaseRegOp =
767
MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
768
: AArch64InstrInfo::getLdStBaseOp(*I);
769
770
// Which register is Rt and which is Rt2 depends on the offset order.
771
int64_t IOffsetInBytes =
772
AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
773
int64_t MIOffsetInBytes =
774
AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
775
MergeMIOffsetStride;
776
// Select final offset based on the offset order.
777
int64_t OffsetImm;
778
if (IOffsetInBytes > MIOffsetInBytes)
779
OffsetImm = MIOffsetInBytes;
780
else
781
OffsetImm = IOffsetInBytes;
782
783
int NewOpcode = getMatchingWideOpcode(Opc);
784
bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);
785
786
// Adjust final offset if the result opcode is a scaled store.
787
if (FinalIsScaled) {
788
int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;
789
assert(((OffsetImm % NewOffsetStride) == 0) &&
790
"Offset should be a multiple of the store memory scale");
791
OffsetImm = OffsetImm / NewOffsetStride;
792
}
793
794
// Construct the new instruction.
795
DebugLoc DL = I->getDebugLoc();
796
MachineBasicBlock *MBB = I->getParent();
797
MachineInstrBuilder MIB;
798
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
799
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
800
.add(BaseRegOp)
801
.addImm(OffsetImm)
802
.cloneMergedMemRefs({&*I, &*MergeMI})
803
.setMIFlags(I->mergeFlagsWith(*MergeMI));
804
(void)MIB;
805
806
LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
807
LLVM_DEBUG(I->print(dbgs()));
808
LLVM_DEBUG(dbgs() << " ");
809
LLVM_DEBUG(MergeMI->print(dbgs()));
810
LLVM_DEBUG(dbgs() << " with instruction:\n ");
811
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
812
LLVM_DEBUG(dbgs() << "\n");
813
814
// Erase the old instructions.
815
I->eraseFromParent();
816
MergeMI->eraseFromParent();
817
return NextI;
818
}
819
820
// Apply Fn to all instructions between MI and the beginning of the block, until
821
// a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
823
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
824
const TargetRegisterInfo *TRI, unsigned Limit,
825
std::function<bool(MachineInstr &, bool)> &Fn) {
826
auto MBB = MI.getParent();
827
for (MachineInstr &I :
828
instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
829
if (!Limit)
830
return false;
831
--Limit;
832
833
bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
834
return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
835
TRI->regsOverlap(MOP.getReg(), DefReg);
836
});
837
if (!Fn(I, isDef))
838
return false;
839
if (isDef)
840
break;
841
}
842
return true;
843
}
844
845
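// Update Units to reflect MI: first drop the units of all registers MI kills,
// then add the units of all remaining register operands (defs and uses that
// are not killed).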
static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
846
const TargetRegisterInfo *TRI) {
847
848
for (const MachineOperand &MOP : phys_regs_and_masks(MI))
849
if (MOP.isReg() && MOP.isKill())
850
Units.removeReg(MOP.getReg());
851
852
for (const MachineOperand &MOP : phys_regs_and_masks(MI))
853
if (MOP.isReg() && !MOP.isKill())
854
Units.addReg(MOP.getReg());
855
}
856
857
MachineBasicBlock::iterator
858
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
859
MachineBasicBlock::iterator Paired,
860
const LdStPairFlags &Flags) {
861
MachineBasicBlock::iterator E = I->getParent()->end();
862
MachineBasicBlock::iterator NextI = next_nodbg(I, E);
863
// If NextI is the second of the two instructions to be merged, we need
864
// to skip one further. Either way, the merge will invalidate the iterator,
865
// and we don't need to scan the new instruction, as it's a pairwise
866
// instruction, which we're not considering for further action anyway.
867
if (NextI == Paired)
868
NextI = next_nodbg(NextI, E);
869
870
int SExtIdx = Flags.getSExtIdx();
871
unsigned Opc =
872
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
873
bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
874
int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
875
876
bool MergeForward = Flags.getMergeForward();
877
878
std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
879
if (RenameReg) {
880
MCRegister RegToRename = getLdStRegOp(*I).getReg();
881
DefinedInBB.addReg(*RenameReg);
882
883
// Return the sub/super register for RenameReg, matching the size of
884
// OriginalReg.
885
auto GetMatchingSubReg =
886
[this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
887
for (MCPhysReg SubOrSuper :
888
TRI->sub_and_superregs_inclusive(*RenameReg)) {
889
if (C->contains(SubOrSuper))
890
return SubOrSuper;
891
}
892
llvm_unreachable("Should have found matching sub or super register!");
893
};
894
895
std::function<bool(MachineInstr &, bool)> UpdateMIs =
896
[this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
897
bool IsDef) {
898
if (IsDef) {
899
bool SeenDef = false;
900
for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
901
MachineOperand &MOP = MI.getOperand(OpIdx);
902
// Rename the first explicit definition and all implicit
903
// definitions matching RegToRename.
904
if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
905
(!MergeForward || !SeenDef ||
906
(MOP.isDef() && MOP.isImplicit())) &&
907
TRI->regsOverlap(MOP.getReg(), RegToRename)) {
908
assert((MOP.isImplicit() ||
909
(MOP.isRenamable() && !MOP.isEarlyClobber())) &&
910
"Need renamable operands");
911
Register MatchingReg;
912
if (const TargetRegisterClass *RC =
913
MI.getRegClassConstraint(OpIdx, TII, TRI))
914
MatchingReg = GetMatchingSubReg(RC);
915
else {
916
if (!isRewritableImplicitDef(MI.getOpcode()))
917
continue;
918
MatchingReg = GetMatchingSubReg(
919
TRI->getMinimalPhysRegClass(MOP.getReg()));
920
}
921
MOP.setReg(MatchingReg);
922
SeenDef = true;
923
}
924
}
925
} else {
926
for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
927
MachineOperand &MOP = MI.getOperand(OpIdx);
928
if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
929
TRI->regsOverlap(MOP.getReg(), RegToRename)) {
930
assert((MOP.isImplicit() ||
931
(MOP.isRenamable() && !MOP.isEarlyClobber())) &&
932
"Need renamable operands");
933
Register MatchingReg;
934
if (const TargetRegisterClass *RC =
935
MI.getRegClassConstraint(OpIdx, TII, TRI))
936
MatchingReg = GetMatchingSubReg(RC);
937
else
938
MatchingReg = GetMatchingSubReg(
939
TRI->getMinimalPhysRegClass(MOP.getReg()));
940
assert(MatchingReg != AArch64::NoRegister &&
941
"Cannot find matching regs for renaming");
942
MOP.setReg(MatchingReg);
943
}
944
}
945
}
946
LLVM_DEBUG(dbgs() << "Renamed " << MI);
947
return true;
948
};
949
forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI,
950
UINT32_MAX, UpdateMIs);
951
952
#if !defined(NDEBUG)
953
// For forward merging store:
954
// Make sure the register used for renaming is not used between the
955
// paired instructions. That would trash the content before the new
956
// paired instruction.
957
MCPhysReg RegToCheck = *RenameReg;
958
// For backward merging load:
959
// Make sure the register being renamed is not used between the
960
// paired instructions. That would trash the content after the new
961
// paired instruction.
962
if (!MergeForward)
963
RegToCheck = RegToRename;
964
for (auto &MI :
965
iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
966
MergeForward ? std::next(I) : I,
967
MergeForward ? std::next(Paired) : Paired))
968
assert(all_of(MI.operands(),
969
[this, RegToCheck](const MachineOperand &MOP) {
970
return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
971
MOP.isUndef() ||
972
!TRI->regsOverlap(MOP.getReg(), RegToCheck);
973
}) &&
974
"Rename register used between paired instruction, trashing the "
975
"content");
976
#endif
977
}
978
979
// Insert our new paired instruction after whichever of the paired
980
// instructions MergeForward indicates.
981
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
982
// MergeForward also determines from which instruction we copy the base register
// operand, so that we get flags compatible with the input code.
984
const MachineOperand &BaseRegOp =
985
MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
986
: AArch64InstrInfo::getLdStBaseOp(*I);
987
988
int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
989
int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
990
bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
991
if (IsUnscaled != PairedIsUnscaled) {
992
// We're trying to pair instructions that differ in how they are scaled. If
993
// I is scaled then scale the offset of Paired accordingly. Otherwise, do
994
// the opposite (i.e., make Paired's offset unscaled).
995
int MemSize = TII->getMemScale(*Paired);
996
if (PairedIsUnscaled) {
997
// If the unscaled offset isn't a multiple of the MemSize, we can't
998
// pair the operations together.
999
assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1000
"Offset should be a multiple of the stride!");
1001
PairedOffset /= MemSize;
1002
} else {
1003
PairedOffset *= MemSize;
1004
}
1005
}
1006
1007
// Which register is Rt and which is Rt2 depends on the offset order.
1008
// However, for pre load/stores the Rt should be the one of the pre
1009
// load/store.
1010
MachineInstr *RtMI, *Rt2MI;
1011
if (Offset == PairedOffset + OffsetStride &&
1012
!AArch64InstrInfo::isPreLdSt(*I)) {
1013
RtMI = &*Paired;
1014
Rt2MI = &*I;
1015
// Here we swapped the assumption made for SExtIdx.
1016
// I.e., we turn ldp I, Paired into ldp Paired, I.
1017
// Update the index accordingly.
1018
if (SExtIdx != -1)
1019
SExtIdx = (SExtIdx + 1) % 2;
1020
} else {
1021
RtMI = &*I;
1022
Rt2MI = &*Paired;
1023
}
1024
int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
1025
// Scale the immediate offset, if necessary.
1026
if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
1027
assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1028
"Unscaled offset cannot be scaled.");
1029
OffsetImm /= TII->getMemScale(*RtMI);
1030
}
1031
1032
// Construct the new instruction.
1033
MachineInstrBuilder MIB;
1034
DebugLoc DL = I->getDebugLoc();
1035
MachineBasicBlock *MBB = I->getParent();
1036
MachineOperand RegOp0 = getLdStRegOp(*RtMI);
1037
MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
1038
MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1039
// Kill flags may become invalid when moving stores for pairing.
1040
if (RegOp0.isUse()) {
1041
if (!MergeForward) {
1042
// Clear kill flags on store if moving upwards. Example:
1043
// STRWui kill %w0, ...
1044
// USE %w1
1045
// STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1046
// We are about to move the store of w1, so its kill flag may become
1047
// invalid; not the case for w0.
1048
// Since w1 is used between the stores, the kill flag on w1 is cleared
1049
// after merging.
1050
// STPWi kill %w0, %w1, ...
1051
// USE %w1
1052
for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)
1053
if (It->readsRegister(PairedRegOp.getReg(), TRI))
1054
PairedRegOp.setIsKill(false);
1055
} else {
1056
// Clear kill flags of the first store's register. Example:
1057
// STRWui %w1, ...
1058
// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1059
// STRW %w0
1060
Register Reg = getLdStRegOp(*I).getReg();
1061
for (MachineInstr &MI : make_range(std::next(I), Paired))
1062
MI.clearRegisterKills(Reg, TRI);
1063
}
1064
}
1065
1066
unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1067
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
1068
1069
// Adds the pre-index operand for pre-indexed ld/st pairs.
1070
if (AArch64InstrInfo::isPreLdSt(*RtMI))
1071
MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1072
1073
MIB.add(RegOp0)
1074
.add(RegOp1)
1075
.add(BaseRegOp)
1076
.addImm(OffsetImm)
1077
.cloneMergedMemRefs({&*I, &*Paired})
1078
.setMIFlags(I->mergeFlagsWith(*Paired));
1079
1080
(void)MIB;
1081
1082
LLVM_DEBUG(
1083
dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1084
LLVM_DEBUG(I->print(dbgs()));
1085
LLVM_DEBUG(dbgs() << " ");
1086
LLVM_DEBUG(Paired->print(dbgs()));
1087
LLVM_DEBUG(dbgs() << " with instruction:\n ");
1088
if (SExtIdx != -1) {
1089
// Generate the sign extension for the proper result of the ldp.
1090
// I.e., with X1, that would be:
1091
// %w1 = KILL %w1, implicit-def %x1
1092
// %x1 = SBFMXri killed %x1, 0, 31
1093
MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1094
// Right now, DstMO has the extended register, since it comes from an
1095
// extended opcode.
1096
Register DstRegX = DstMO.getReg();
1097
// Get the W variant of that register.
1098
Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1099
// Update the result of LDP to use the W instead of the X variant.
1100
DstMO.setReg(DstRegW);
1101
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1102
LLVM_DEBUG(dbgs() << "\n");
1103
// Make the machine verifier happy by providing a definition for
1104
// the X register.
1105
// Insert this definition right after the generated LDP, i.e., before
1106
// InsertionPoint.
1107
MachineInstrBuilder MIBKill =
1108
BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1109
.addReg(DstRegW)
1110
.addReg(DstRegX, RegState::Define);
1111
MIBKill->getOperand(2).setImplicit();
1112
// Create the sign extension.
1113
MachineInstrBuilder MIBSXTW =
1114
BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1115
.addReg(DstRegX)
1116
.addImm(0)
1117
.addImm(31);
1118
(void)MIBSXTW;
1119
LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1120
LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1121
} else {
1122
LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1123
}
1124
LLVM_DEBUG(dbgs() << "\n");
1125
1126
if (MergeForward)
1127
for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1128
if (MOP.isReg() && MOP.isKill())
1129
DefinedInBB.addReg(MOP.getReg());
1130
1131
// Erase the old instructions.
1132
I->eraseFromParent();
1133
Paired->eraseFromParent();
1134
1135
return NextI;
1136
}
1137
1138
MachineBasicBlock::iterator
1139
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1140
MachineBasicBlock::iterator StoreI) {
1141
MachineBasicBlock::iterator NextI =
1142
next_nodbg(LoadI, LoadI->getParent()->end());
1143
1144
int LoadSize = TII->getMemScale(*LoadI);
1145
int StoreSize = TII->getMemScale(*StoreI);
1146
Register LdRt = getLdStRegOp(*LoadI).getReg();
1147
const MachineOperand &StMO = getLdStRegOp(*StoreI);
1148
Register StRt = getLdStRegOp(*StoreI).getReg();
1149
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1150
1151
assert((IsStoreXReg ||
1152
TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1153
"Unexpected RegClass");
1154
1155
MachineInstr *BitExtMI;
1156
if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1157
// Remove the load if the destination register of the load is the same
// register as the stored value.
1159
if (StRt == LdRt && LoadSize == 8) {
1160
for (MachineInstr &MI : make_range(StoreI->getIterator(),
1161
LoadI->getIterator())) {
1162
if (MI.killsRegister(StRt, TRI)) {
1163
MI.clearRegisterKills(StRt, TRI);
1164
break;
1165
}
1166
}
1167
LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1168
LLVM_DEBUG(LoadI->print(dbgs()));
1169
LLVM_DEBUG(dbgs() << "\n");
1170
LoadI->eraseFromParent();
1171
return NextI;
1172
}
1173
// Replace the load with a mov if the load and store have the same size.
1174
BitExtMI =
1175
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1176
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1177
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1178
.add(StMO)
1179
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1180
.setMIFlags(LoadI->getFlags());
1181
} else {
1182
// FIXME: Currently we disable this transformation in big-endian targets as
1183
// performance and correctness are verified only in little-endian.
1184
if (!Subtarget->isLittleEndian())
1185
return NextI;
1186
bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1187
assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1188
"Unsupported ld/st match");
1189
assert(LoadSize <= StoreSize && "Invalid load size");
1190
int UnscaledLdOffset =
1191
IsUnscaled
1192
? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1193
: AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1194
int UnscaledStOffset =
1195
IsUnscaled
1196
? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1197
: AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1198
int Width = LoadSize * 8;
1199
Register DestReg =
1200
IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1201
LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1202
: LdRt;
1203
1204
assert((UnscaledLdOffset >= UnscaledStOffset &&
1205
(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1206
"Invalid offset");
1207
1208
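// The loaded value occupies bits [Immr, Immr + Width - 1] of the stored
// register; the AND/UBFM built below extracts exactly those bits.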
int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1209
int Imms = Immr + Width - 1;
1210
if (UnscaledLdOffset == UnscaledStOffset) {
1211
uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1212
| ((Immr) << 6) // immr
1213
| ((Imms) << 0) // imms
1214
;
1215
1216
BitExtMI =
1217
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1218
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1219
DestReg)
1220
.add(StMO)
1221
.addImm(AndMaskEncoded)
1222
.setMIFlags(LoadI->getFlags());
1223
} else {
1224
BitExtMI =
1225
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1226
TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1227
DestReg)
1228
.add(StMO)
1229
.addImm(Immr)
1230
.addImm(Imms)
1231
.setMIFlags(LoadI->getFlags());
1232
}
1233
}
1234
1235
// Clear kill flags between store and load.
1236
for (MachineInstr &MI : make_range(StoreI->getIterator(),
1237
BitExtMI->getIterator()))
1238
if (MI.killsRegister(StRt, TRI)) {
1239
MI.clearRegisterKills(StRt, TRI);
1240
break;
1241
}
1242
1243
LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1244
LLVM_DEBUG(StoreI->print(dbgs()));
1245
LLVM_DEBUG(dbgs() << " ");
1246
LLVM_DEBUG(LoadI->print(dbgs()));
1247
LLVM_DEBUG(dbgs() << " with instructions:\n ");
1248
LLVM_DEBUG(StoreI->print(dbgs()));
1249
LLVM_DEBUG(dbgs() << " ");
1250
LLVM_DEBUG((BitExtMI)->print(dbgs()));
1251
LLVM_DEBUG(dbgs() << "\n");
1252
1253
// Erase the old instructions.
1254
LoadI->eraseFromParent();
1255
return NextI;
1256
}
1257
1258
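// Returns true if the element offset fits in the 7-bit signed immediate field
// of a paired load/store instruction.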
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1259
// Convert the byte-offset used by unscaled instructions into an "element"
// offset used by the scaled pair load/store instructions.
1261
if (IsUnscaled) {
1262
// If the byte-offset isn't a multiple of the stride, there's no point
1263
// trying to match it.
1264
if (Offset % OffsetStride)
1265
return false;
1266
Offset /= OffsetStride;
1267
}
1268
return Offset <= 63 && Offset >= -64;
1269
}
1270
1271
// Do alignment, specialized to power of 2 and for signed ints,
1272
// avoiding having to do a C-style cast from uint64_t to int when
1273
// using alignTo from include/llvm/Support/MathExtras.h.
1274
// FIXME: Move this function to include/MathExtras.h?
1275
static int alignTo(int Num, int PowOf2) {
1276
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1277
}
1278
1279
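// Returns true if MIa may alias any of the memory instructions in MemInsns.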
static bool mayAlias(MachineInstr &MIa,
1280
SmallVectorImpl<MachineInstr *> &MemInsns,
1281
AliasAnalysis *AA) {
1282
for (MachineInstr *MIb : MemInsns) {
1283
if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
1284
LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1285
return true;
1286
}
1287
}
1288
1289
LLVM_DEBUG(dbgs() << "No aliases found\n");
1290
return false;
1291
}
1292
1293
bool AArch64LoadStoreOpt::findMatchingStore(
1294
MachineBasicBlock::iterator I, unsigned Limit,
1295
MachineBasicBlock::iterator &StoreI) {
1296
MachineBasicBlock::iterator B = I->getParent()->begin();
1297
MachineBasicBlock::iterator MBBI = I;
1298
MachineInstr &LoadMI = *I;
1299
Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1300
1301
// If the load is the first instruction in the block, there's obviously
1302
// not any matching store.
1303
if (MBBI == B)
1304
return false;
1305
1306
// Track which register units have been modified and used between the first
1307
// insn and the second insn.
1308
ModifiedRegUnits.clear();
1309
UsedRegUnits.clear();
1310
1311
unsigned Count = 0;
1312
do {
1313
MBBI = prev_nodbg(MBBI, B);
1314
MachineInstr &MI = *MBBI;
1315
1316
// Don't count transient instructions towards the search limit since there
1317
// may be different numbers of them if e.g. debug information is present.
1318
if (!MI.isTransient())
1319
++Count;
1320
1321
// If the load instruction reads directly from the address to which the
1322
// store instruction writes and the stored value is not modified, we can
1323
// promote the load. Since we do not handle stores with pre-/post-index,
1324
// it's unnecessary to check if BaseReg is modified by the store itself.
1325
// Also, we can't handle stores without an immediate offset operand,
// since the operand might be the address of a global variable.
1327
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1328
BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1329
AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1330
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1331
ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1332
StoreI = MBBI;
1333
return true;
1334
}
1335
1336
if (MI.isCall())
1337
return false;
1338
1339
// Update modified / uses register units.
1340
LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1341
1342
// Otherwise, if the base register is modified, we have no match, so
1343
// return early.
1344
if (!ModifiedRegUnits.available(BaseReg))
1345
return false;
1346
1347
// If we encounter a store aliased with the load, return early.
1348
if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1349
return false;
1350
} while (MBBI != B && Count < Limit);
1351
return false;
1352
}
1353
1354
static bool needsWinCFI(const MachineFunction *MF) {
1355
return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1356
MF->getFunction().needsUnwindTableEntry();
1357
}
1358
1359
// Returns true if FirstMI and MI are candidates for merging or pairing.
1360
// Otherwise, returns false.
1361
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1362
LdStPairFlags &Flags,
1363
const AArch64InstrInfo *TII) {
1364
// If this is volatile or if pairing is suppressed, not a candidate.
1365
if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1366
return false;
1367
1368
// We should have already checked FirstMI for pair suppression and volatility.
1369
assert(!FirstMI.hasOrderedMemoryRef() &&
1370
!TII->isLdStPairSuppressed(FirstMI) &&
1371
"FirstMI shouldn't get here if either of these checks are true.");
1372
1373
if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
1374
MI.getFlag(MachineInstr::FrameDestroy)))
1375
return false;
1376
1377
unsigned OpcA = FirstMI.getOpcode();
1378
unsigned OpcB = MI.getOpcode();
1379
1380
// Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1381
if (OpcA == OpcB)
1382
return !AArch64InstrInfo::isPreLdSt(FirstMI);
1383
1384
// Two pre ld/st of different opcodes cannot be merged either.
1385
if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1386
return false;
1387
1388
// Try to match a sign-extended load/store with a zero-extended load/store.
1389
bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1390
unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1391
assert(IsValidLdStrOpc &&
1392
"Given Opc should be a Load or Store with an immediate");
1393
// OpcA will be the first instruction in the pair.
1394
if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1395
Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1396
return true;
1397
}
1398
1399
// If the second instruction isn't even a mergable/pairable load/store, bail
1400
// out.
1401
if (!PairIsValidLdStrOpc)
1402
return false;
1403
1404
// FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1405
// offsets.
1406
if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1407
return false;
1408
1409
// The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1410
// LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1411
// are candidate pairs that can be merged.
1412
if (isPreLdStPairCandidate(FirstMI, MI))
1413
return true;
1414
1415
// Try to match an unscaled load/store with a scaled load/store.
1416
return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1417
getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1418
1419
// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1420
}
1421
1422
static bool canRenameMOP(const MachineOperand &MOP,
1423
const TargetRegisterInfo *TRI) {
1424
if (MOP.isReg()) {
1425
auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1426
// Renaming registers with multiple disjunct sub-registers (e.g. the
1427
// result of a LD3) means that all sub-registers are renamed, potentially
1428
// impacting other instructions we did not check. Bail out.
1429
// Note that this relies on the structure of the AArch64 register file. In
1430
// particular, a subregister cannot be written without overwriting the
1431
// whole register.
1432
if (RegClass->HasDisjunctSubRegs) {
1433
LLVM_DEBUG(
1434
dbgs()
1435
<< " Cannot rename operands with multiple disjunct subregisters ("
1436
<< MOP << ")\n");
1437
return false;
1438
}
1439
1440
// We cannot rename arbitrary implicit-defs; the specific rule to rewrite
1441
// them must be known. For example, in ORRWrs the implicit-def
1442
// corresponds to the result register.
1443
if (MOP.isImplicit() && MOP.isDef()) {
1444
if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
1445
return false;
1446
return TRI->isSuperOrSubRegisterEq(
1447
MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
1448
}
1449
}
1450
return MOP.isImplicit() ||
1451
(MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1452
}
1453
1454
static bool
1455
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1456
SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1457
const TargetRegisterInfo *TRI) {
1458
if (!FirstMI.mayStore())
1459
return false;
1460
1461
// Check if we can find an unused register which we can use to rename
1462
// the register used by the first load/store.
1463
1464
auto RegToRename = getLdStRegOp(FirstMI).getReg();
1465
// For now, we only rename if the store operand gets killed at the store.
1466
if (!getLdStRegOp(FirstMI).isKill() &&
1467
!any_of(FirstMI.operands(),
1468
[TRI, RegToRename](const MachineOperand &MOP) {
1469
return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1470
MOP.isImplicit() && MOP.isKill() &&
1471
TRI->regsOverlap(RegToRename, MOP.getReg());
1472
})) {
1473
LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1474
return false;
1475
}
1476
1477
bool FoundDef = false;
1478
1479
// For each instruction between FirstMI and the previous def for RegToRename, we
// * check if we can rename RegToRename in this instruction
// * collect the registers used and required register classes for RegToRename.
1483
std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1484
bool IsDef) {
1485
LLVM_DEBUG(dbgs() << "Checking " << MI);
1486
// Currently we do not try to rename across frame-setup instructions.
1487
if (MI.getFlag(MachineInstr::FrameSetup)) {
1488
LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1489
<< "currently\n");
1490
return false;
1491
}
1492
1493
UsedInBetween.accumulate(MI);
1494
1495
// For a definition, check that we can rename the definition and exit the
1496
// loop.
1497
FoundDef = IsDef;
1498
1499
// For defs, check if we can rename the first def of RegToRename.
1500
if (FoundDef) {
1501
// For some pseudo instructions, we might not generate code in the end
1502
// (e.g. KILL) and we would end up without a correct def for the rename
1503
// register.
1504
// TODO: This might be overly conservative and we could handle those cases
1505
// in multiple ways:
1506
// 1. Insert an extra copy, to materialize the def.
1507
// 2. Skip pseudo-defs until we find an non-pseudo def.
1508
if (MI.isPseudo()) {
1509
LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1510
return false;
1511
}
1512
1513
for (auto &MOP : MI.operands()) {
1514
if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1515
!TRI->regsOverlap(MOP.getReg(), RegToRename))
1516
continue;
1517
if (!canRenameMOP(MOP, TRI)) {
1518
LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1519
return false;
1520
}
1521
RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1522
}
1523
return true;
1524
} else {
1525
for (auto &MOP : MI.operands()) {
1526
if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1527
!TRI->regsOverlap(MOP.getReg(), RegToRename))
1528
continue;
1529
1530
if (!canRenameMOP(MOP, TRI)) {
1531
LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1532
return false;
1533
}
1534
RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1535
}
1536
}
1537
return true;
1538
};
1539
1540
if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1541
return false;
1542
1543
if (!FoundDef) {
1544
LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1545
return false;
1546
}
1547
return true;
1548
}
1549
1550
// We want to merge the second load into the first by rewriting the usages of
1551
// the same reg between first (incl.) and second (excl.). We don't need to care
1552
// about any insns before FirstLoad or after SecondLoad.
1553
// 1. The second load writes a new value into the same reg.
//    - The renaming cannot impact later uses of the reg.
//    - The second load always trashes the value written by the first load,
//      which means the reg must be killed before the second load.
1557
// 2. The first load must be a def for the same reg so we don't need to look
1558
// into anything before it.
1559
static bool canRenameUntilSecondLoad(
1560
MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1561
LiveRegUnits &UsedInBetween,
1562
SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1563
const TargetRegisterInfo *TRI) {
1564
if (FirstLoad.isPseudo())
1565
return false;
1566
1567
UsedInBetween.accumulate(FirstLoad);
1568
auto RegToRename = getLdStRegOp(FirstLoad).getReg();
1569
bool Success = std::all_of(
1570
FirstLoad.getIterator(), SecondLoad.getIterator(),
1571
[&](MachineInstr &MI) {
1572
LLVM_DEBUG(dbgs() << "Checking " << MI);
1573
// Currently we do not try to rename across frame-setup instructions.
1574
if (MI.getFlag(MachineInstr::FrameSetup)) {
1575
LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1576
<< "currently\n");
1577
return false;
1578
}
1579
1580
for (auto &MOP : MI.operands()) {
1581
if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1582
!TRI->regsOverlap(MOP.getReg(), RegToRename))
1583
continue;
1584
if (!canRenameMOP(MOP, TRI)) {
1585
LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1586
return false;
1587
}
1588
RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1589
}
1590
1591
return true;
1592
});
1593
return Success;
1594
}
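
// Hypothetical illustration (register choice is not from this file): two loads
// into the same register become pairable once the first definition and its
// uses up to the second load are renamed:
//   ldr w8, [x0]            ldr w9, [x0]
//   add w1, w1, w8    ==>   add w1, w1, w9
//   ldr w8, [x0, #4]        ldr w8, [x0, #4]
// The two loads can then be combined into "ldp w9, w8, [x0]".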

// Check if we can find a physical register for renaming \p Reg. This register
// must:
// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
//   defined registers up to the point where the renamed register will be used,
// * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range in which the rename register will be used,
// * be available in all used register classes (checked using RequiredClasses).
static std::optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(
          TRI->sub_and_superregs_inclusive(PR),
          [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return std::nullopt;
}
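
// Note on the callee-saved rejection above: picking a callee-saved register as
// the rename register would require a save/restore for it, and this pass runs
// after prologue/epilogue insertion, so no such spill could be added; the
// check is therefore deliberately conservative.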

// For store pairs: returns a register that can be renamed over the range from
// FirstMI back to the beginning of the block.
// For load pairs: returns a register that can be renamed over the range from
// FirstMI to MI.
static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
    std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
    Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  std::optional<MCPhysReg> RenameReg;
  if (!DebugCounter::shouldExecute(RegRenamingCounter))
    return RenameReg;

  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return RenameReg;

  const bool IsLoad = FirstMI.mayLoad();

  if (!MaybeCanRename) {
    if (IsLoad)
      MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
                                                 RequiredClasses, TRI)};
    else
      MaybeCanRename = {
          canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
  }

  if (*MaybeCanRename) {
    RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
                                          RequiredClasses, TRI);
  }
  return RenameReg;
}

/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can't be combined into a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction are
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1, [x2]
        //   ldr x2, [x3]
        //   ldr x4, [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2].
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                Reg, getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(&MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store, stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
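
// A rough sketch of the two outcomes decided above (the example code is
// illustrative, not taken from this file): with MergeForward == false the
// pair is formed at FirstMI's position,
//   ldr x0, [x2]               ldp x0, x1, [x2]
//   ...                  ==>   ...
//   ldr x1, [x2, #8]
// whereas with MergeForward == true the pair is formed at MI's position,
// which is the fallback when the second access cannot safely be hoisted up to
// the first.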

static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  auto End = MI.getParent()->end();
  if (MaybeCFI == End ||
      MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
      !(MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy)) ||
      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
    return End;

  const MachineFunction &MF = *MI.getParent()->getParent();
  unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
  const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
  switch (CFI.getOperation()) {
  case MCCFIInstruction::OpDefCfa:
  case MCCFIInstruction::OpDefCfaOffset:
    return MaybeCFI;
  default:
    return End;
  }
}
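
// Sketch of the reordering this enables (illustrative, assuming a prologue
// that pre-decrements SP):
//   sub sp, sp, #16              str x19, [sp, #-16]!
//   .cfi_def_cfa_offset 16  ==>  .cfi_def_cfa_offset 16
//   str x19, [sp]
// The CFI directive that described the SP update is spliced after the merged
// pre-indexed store by mergeUpdateInsn below.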

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);

  // If we are updating the SP and the following instruction is a
  // CFA-offset-related CFI instruction, move it after the merged instruction.
  MachineBasicBlock::iterator CFI =
      IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  if (CFI != E) {
    MachineBasicBlock *MBB = I->getParent();
    MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
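
// For instance (illustrative values): with "ldr x0, [x1]" as MemMI and
// Offset == 0, "add x1, x1, #8" matches and can later be folded into
// "ldr x0, [x1], #8", whereas "add x1, x1, #8, lsl #12" or "add x2, x1, #8"
// would be rejected by the checks above.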

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
                         TII->getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // Neither does it apply to STGPi, which, unlike normal stores, does not have
  // unpredictable behavior in this case and always performs writeback after
  // reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires adjusting the unwind/debug info; messing up the
    // unwind info can actually cause a miscompile.
    return E;
  }

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    // If we are optimizing SP, do not allow instructions that may load or store
    // in between the load and the optimized value update.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires adjusting the unwind/debug info; messing up the
    // unwind info can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
    // Keep track of whether we have a memory access before an SP pre-increment;
    // in this case we need to validate later that the update amount respects
    // the red zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}
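
// Illustrative scenario for the red-zone check above (registers and sizes are
// hypothetical): folding "sub sp, sp, #32" into a later "str x21, [sp]" to
// form "str x21, [sp, #-32]!" delays the SP decrement past any intervening
// memory access, so anything that access wrote to the newly allocated area
// would briefly sit below SP; that is only tolerable when the update amount
// fits in the function's red zone.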

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If the disable-ldp feature is enabled, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If the disable-stp feature is enabled, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // If the ldp-aligned-only/stp-aligned-only feature is enabled, check that
    // the alignment of the source pointer is at least double the alignment of
    // the type.
    if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
        (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
      // If there is no size/align information, cancel the transformation.
      if (!MemOp || !MemOp->getMemoryType().isValid()) {
        NumFailedAlignmentCheck++;
        return false;
      }

      // Get the alignments needed for the ldp-aligned-only/stp-aligned-only
      // checks.
      uint64_t MemAlignment = MemOp->getAlign().value();
      uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();

      if (MemAlignment < 2 * TypeAlignment) {
        NumFailedAlignmentCheck++;
        return false;
      }
    }

    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;

    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);

    return true;
  }
  return false;
}
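
// Illustrative reading of the aligned-only check (values hypothetical): with
// the ldp-aligned-only feature, two adjacent 8-byte loads are only paired when
// the memory operand is known to be at least 16-byte aligned; a pointer known
// to be only 8-byte aligned keeps the loads separate.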

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  //   ldr x0, [x20]
  //   add x20, x20, #32
  // merged into:
  //   ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  //   add x0, x0, #8
  //   ldr x1, [x0]
  // merged into:
  //   ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset =
      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  //   ldr x1, [x0, #64]
  //   add x0, x0, #64
  // merged into:
  //   ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {

  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]

  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, a pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}