Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
213799 views
1
//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file This file contains the ARM definition DAG scheduling mutations which
10
/// change inter-instruction latencies
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "ARMLatencyMutations.h"
15
#include "ARMSubtarget.h"
16
#include "Thumb2InstrInfo.h"
17
#include "llvm/Analysis/AliasAnalysis.h"
18
#include "llvm/CodeGen/ScheduleDAG.h"
19
#include "llvm/CodeGen/ScheduleDAGMutation.h"
20
#include "llvm/CodeGen/TargetInstrInfo.h"
21
#include <algorithm>
22
#include <array>
23
#include <initializer_list>
24
#include <memory>
25
26
namespace llvm {
27
28
namespace {
29
30
// Precompute information about opcodes to speed up pass
31
32
class InstructionInformation {
33
protected:
34
struct IInfo {
35
bool HasBRegAddr : 1; // B-side of addr gen is a register
36
bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
37
bool IsDivide : 1; // Some form of integer divide
38
bool IsInlineShiftALU : 1; // Inline shift+ALU
39
bool IsMultiply : 1; // Some form of integer multiply
40
bool IsMVEIntMAC : 1; // MVE 8/16/32-bit integer MAC operation
41
bool IsNonSubwordLoad : 1; // Load which is a word or larger
42
bool IsShift : 1; // Shift operation
43
bool IsRev : 1; // REV operation
44
bool ProducesQP : 1; // Produces a vector register result
45
bool ProducesDP : 1; // Produces a double-precision register result
46
bool ProducesSP : 1; // Produces a single-precision register result
47
bool ConsumesQP : 1; // Consumes a vector register result
48
bool ConsumesDP : 1; // Consumes a double-precision register result
49
bool ConsumesSP : 1; // Consumes a single-precision register result
50
unsigned MVEIntMACMatched; // Matched operand type (for MVE)
51
unsigned AddressOpMask; // Mask indicating which operands go into AGU
52
IInfo()
53
: HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
54
IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
55
IsNonSubwordLoad(false), IsShift(false), IsRev(false),
56
ProducesQP(false), ProducesDP(false), ProducesSP(false),
57
ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
58
MVEIntMACMatched(0), AddressOpMask(0) {}
59
};
60
typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
61
IInfoArray Info;
62
63
public:
64
// Always available information
65
unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
66
bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
67
bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
68
bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
69
bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
70
bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
71
bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
72
bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
73
bool isRev(unsigned Op) { return Info[Op].IsRev; }
74
bool isShift(unsigned Op) { return Info[Op].IsShift; }
75
76
// information available if markDPConsumers is called.
77
bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
78
bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
79
bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
80
bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
81
bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
82
bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
83
84
bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
85
return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
86
}
87
88
InstructionInformation(const ARMBaseInstrInfo *TII);
89
90
protected:
91
void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
92
};
93
94
// Populate the always-available part of the opcode table: every opcode
// starts with a default (all-clear) IInfo record, and the lists below set the
// individual flags and the address-operand masks.
//
// TII is not used by this constructor.
InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
  };
  for (unsigned op : hasBRegAddrList)
    Info[op].HasBRegAddr = true;

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (unsigned op : hasBRegAddrShiftList)
    Info[op].HasBRegAddrShift = true;

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (unsigned op : isInlineShiftALUList)
    Info[op].IsInlineShiftALU = true;

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
      t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
      t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
      t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
      t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
  };
  for (unsigned op : isMultiplyList)
    Info[op].IsMultiply = true;

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
  };
  for (unsigned op : isMVEIntMACList)
    Info[op].IsMVEIntMAC = true;

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
      t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci,  tLDRr,    tLDRspi,
  };
  for (unsigned op : isNonSubwordLoadList)
    Info[op].IsNonSubwordLoad = true;

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (unsigned op : isRevList)
    Info[op].IsRev = true;

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
  };
  for (unsigned op : isShiftList)
    Info[op].IsShift = true;

  // Opcodes that receive address mask 0x6 (bits 1 and 2).
  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,        t2LDRBi8,          t2LDRBpci,         t2LDRBs,
      t2LDRHi12,        t2LDRHi8,          t2LDRHpci,         t2LDRHs,
      t2LDRSBi12,       t2LDRSBi8,         t2LDRSBpci,        t2LDRSBs,
      t2LDRSHi12,       t2LDRSHi8,         t2LDRSHpci,        t2LDRSHs,
      t2LDRi12,         t2LDRi8,           t2LDRpci,          t2LDRs,
      tLDRBi,           tLDRBr,            tLDRHi,            tLDRHr,
      tLDRSB,           tLDRSH,            tLDRi,             tLDRpci,
      tLDRr,            tLDRspi,
      t2STRBi12,        t2STRBi8,          t2STRBs,
      t2STRHi12,        t2STRHi8,          t2STRHs,
      t2STRi12,         t2STRi8,           t2STRs,
      tSTRBi,           tSTRBr,            tSTRHi,            tSTRHr,
      tSTRi,            tSTRr,             tSTRspi,
      VLDRD,            VLDRH,             VLDRS,
      VSTRD,            VSTRH,             VSTRS,
      MVE_VLD20_16,     MVE_VLD20_32,      MVE_VLD20_8,
      MVE_VLD21_16,     MVE_VLD21_32,      MVE_VLD21_8,
      MVE_VLD40_16,     MVE_VLD40_32,      MVE_VLD40_8,
      MVE_VLD41_16,     MVE_VLD41_32,      MVE_VLD41_8,
      MVE_VLD42_16,     MVE_VLD42_32,      MVE_VLD42_8,
      MVE_VLD43_16,     MVE_VLD43_32,      MVE_VLD43_8,
      MVE_VLDRBS16,     MVE_VLDRBS16_rq,
      MVE_VLDRBS32,     MVE_VLDRBS32_rq,
      MVE_VLDRBU16,     MVE_VLDRBU16_rq,
      MVE_VLDRBU32,     MVE_VLDRBU32_rq,
      MVE_VLDRBU8,      MVE_VLDRBU8_rq,
      MVE_VLDRDU64_qi,  MVE_VLDRDU64_rq,   MVE_VLDRDU64_rq_u,
      MVE_VLDRHS32,     MVE_VLDRHS32_rq,   MVE_VLDRHS32_rq_u,
      MVE_VLDRHU16,     MVE_VLDRHU16_rq,   MVE_VLDRHU16_rq_u,
      MVE_VLDRHU32,     MVE_VLDRHU32_rq,   MVE_VLDRHU32_rq_u,
      MVE_VLDRWU32,     MVE_VLDRWU32_qi,   MVE_VLDRWU32_rq,   MVE_VLDRWU32_rq_u,
      MVE_VST20_16,     MVE_VST20_32,      MVE_VST20_8,
      MVE_VST21_16,     MVE_VST21_32,      MVE_VST21_8,
      MVE_VST40_16,     MVE_VST40_32,      MVE_VST40_8,
      MVE_VST41_16,     MVE_VST41_32,      MVE_VST41_8,
      MVE_VST42_16,     MVE_VST42_32,      MVE_VST42_8,
      MVE_VST43_16,     MVE_VST43_32,      MVE_VST43_8,
      MVE_VSTRB16,      MVE_VSTRB16_rq,
      MVE_VSTRB32,      MVE_VSTRB32_rq,
      MVE_VSTRBU8,      MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,   MVE_VSTRD64_rq,    MVE_VSTRD64_rq_u,
      MVE_VSTRH32,      MVE_VSTRH32_rq,    MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,     MVE_VSTRH16_rq,    MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,     MVE_VSTRW32_qi,    MVE_VSTRW32_rq,    MVE_VSTRW32_rq_u,
  };
  // Opcodes that receive address mask 0xc (bits 2 and 3).
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,         t2LDRB_PRE,        t2LDRDi8,
      t2LDRH_POST,         t2LDRH_PRE,
      t2LDRSB_POST,        t2LDRSB_PRE,
      t2LDRSH_POST,        t2LDRSH_PRE,
      t2LDR_POST,          t2LDR_PRE,
      t2STRB_POST,         t2STRB_PRE,        t2STRDi8,
      t2STRH_POST,         t2STRH_PRE,
      t2STR_POST,          t2STR_PRE,
      MVE_VLD20_16_wb,     MVE_VLD20_32_wb,   MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,     MVE_VLD21_32_wb,   MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,     MVE_VLD40_32_wb,   MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,     MVE_VLD41_32_wb,   MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,     MVE_VLD42_32_wb,   MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,     MVE_VLD43_32_wb,   MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,   MVE_VLDRBS16_pre,
      MVE_VLDRBS32_post,   MVE_VLDRBS32_pre,
      MVE_VLDRBU16_post,   MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,   MVE_VLDRBU32_pre,
      MVE_VLDRBU8_post,    MVE_VLDRBU8_pre,
      MVE_VLDRDU64_qi_pre,
      MVE_VLDRHS32_post,   MVE_VLDRHS32_pre,
      MVE_VLDRHU16_post,   MVE_VLDRHU16_pre,
      MVE_VLDRHU32_post,   MVE_VLDRHU32_pre,
      MVE_VLDRWU32_post,   MVE_VLDRWU32_pre,
      MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,     MVE_VST20_32_wb,   MVE_VST20_8_wb,
      MVE_VST21_16_wb,     MVE_VST21_32_wb,   MVE_VST21_8_wb,
      MVE_VST40_16_wb,     MVE_VST40_32_wb,   MVE_VST40_8_wb,
      MVE_VST41_16_wb,     MVE_VST41_32_wb,   MVE_VST41_8_wb,
      MVE_VST42_16_wb,     MVE_VST42_32_wb,   MVE_VST42_8_wb,
      MVE_VST43_16_wb,     MVE_VST43_32_wb,   MVE_VST43_8_wb,
      MVE_VSTRB16_post,    MVE_VSTRB16_pre,
      MVE_VSTRB32_post,    MVE_VSTRB32_pre,
      MVE_VSTRBU8_post,    MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre,
      MVE_VSTRH32_post,    MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,   MVE_VSTRHU16_pre,
      MVE_VSTRWU32_post,   MVE_VSTRWU32_pre,
      MVE_VSTRW32_qi_pre,
  };
  // Opcodes that receive address mask 0x18 (bits 3 and 4).
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST,
      t2LDRD_PRE,
      t2STRD_POST,
      t2STRD_PRE,
  };

  // Compute a mask of which operands are involved in address computation.
  // The plain assignments must run before the OR below so that the shifted
  // forms end up with their list's mask plus bit 3.
  for (unsigned op : Address1List)
    Info[op].AddressOpMask = 0x6;
  for (unsigned op : Address2List)
    Info[op].AddressOpMask = 0xc;
  for (unsigned op : Address3List)
    Info[op].AddressOpMask = 0x18;
  for (unsigned op : hasBRegAddrShiftList)
    Info[op].AddressOpMask |= 0x8;
}
426
427
void InstructionInformation::markDPProducersConsumers(
428
const ARMBaseInstrInfo *TII) {
429
// Learn about all instructions which have FP source/dest registers
430
for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
431
const MCInstrDesc &MID = TII->get(MI);
432
auto Operands = MID.operands();
433
for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
434
bool MarkQP = false, MarkDP = false, MarkSP = false;
435
switch (Operands[OI].RegClass) {
436
case ARM::MQPRRegClassID:
437
case ARM::DPRRegClassID:
438
case ARM::DPR_8RegClassID:
439
case ARM::DPR_VFP2RegClassID:
440
case ARM::DPairRegClassID:
441
case ARM::DPairSpcRegClassID:
442
case ARM::DQuadRegClassID:
443
case ARM::DQuadSpcRegClassID:
444
case ARM::DTripleRegClassID:
445
case ARM::DTripleSpcRegClassID:
446
MarkDP = true;
447
break;
448
case ARM::QPRRegClassID:
449
case ARM::QPR_8RegClassID:
450
case ARM::QPR_VFP2RegClassID:
451
case ARM::QQPRRegClassID:
452
case ARM::QQQQPRRegClassID:
453
MarkQP = true;
454
break;
455
case ARM::SPRRegClassID:
456
case ARM::SPR_8RegClassID:
457
case ARM::FPWithVPRRegClassID:
458
MarkSP = true;
459
break;
460
default:
461
break;
462
}
463
if (MarkQP) {
464
if (OI < MID.getNumDefs())
465
Info[MI].ProducesQP = true;
466
else
467
Info[MI].ConsumesQP = true;
468
}
469
if (MarkDP) {
470
if (OI < MID.getNumDefs())
471
Info[MI].ProducesDP = true;
472
else
473
Info[MI].ConsumesDP = true;
474
}
475
if (MarkSP) {
476
if (OI < MID.getNumDefs())
477
Info[MI].ProducesSP = true;
478
else
479
Info[MI].ConsumesSP = true;
480
}
481
}
482
}
483
}
484
485
} // anonymous namespace
486
487
static bool hasImplicitCPSRUse(const MachineInstr *MI) {
488
return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
489
}
490
491
void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
492
unsigned latency) {
493
SDep Reverse = SrcDep;
494
Reverse.setSUnit(&SrcSU);
495
for (SDep &PDep : SrcDep.getSUnit()->Preds) {
496
if (PDep == Reverse) {
497
PDep.setLatency(latency);
498
SrcDep.getSUnit()->setDepthDirty();
499
break;
500
}
501
}
502
SrcDep.setLatency(latency);
503
SrcSU.setHeightDirty();
504
}
505
506
// Two condition codes are mismatched when they differ in any bit other than
// bit 0.
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  const unsigned Differing = unsigned(a) ^ unsigned(b);
  return (Differing & 0xe) != 0;
}
509
510
// Set output dependences to zero latency for processors which can
511
// simultaneously issue to the same register. Returns true if a change
512
// was made.
513
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
514
if (Dep.getKind() == SDep::Output) {
515
setBidirLatencies(ISU, Dep, 0);
516
return true;
517
}
518
return false;
519
}
520
521
// The graph doesn't look inside of bundles to determine their
522
// scheduling boundaries and reports zero latency into and out of them
523
// (except for CPSR into the bundle, which has latency 1).
524
// Make some better scheduling assumptions:
525
// 1) CPSR uses have zero latency; other uses have incoming latency 1
526
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
527
//
528
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
529
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
530
531
SUnit &DepSU = *Dep.getSUnit();
532
const MachineInstr *SrcMI = ISU.getInstr();
533
unsigned SrcOpcode = SrcMI->getOpcode();
534
const MachineInstr *DstMI = DepSU.getInstr();
535
unsigned DstOpcode = DstMI->getOpcode();
536
537
if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
538
setBidirLatencies(
539
ISU, Dep,
540
(Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
541
return 1;
542
}
543
if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
544
Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
545
setBidirLatencies(ISU, Dep, 1);
546
return 2;
547
}
548
return 0;
549
}
550
551
// Determine whether there is a memory RAW hazard here and set up latency
552
// accordingly
553
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
554
unsigned latency) {
555
if (!Dep.isNormalMemory())
556
return false;
557
auto &SrcInst = *ISU.getInstr();
558
auto &DstInst = *Dep.getSUnit()->getInstr();
559
if (!SrcInst.mayStore() || !DstInst.mayLoad())
560
return false;
561
562
auto SrcMO = *SrcInst.memoperands().begin();
563
auto DstMO = *DstInst.memoperands().begin();
564
auto SrcVal = SrcMO->getValue();
565
auto DstVal = DstMO->getValue();
566
auto SrcPseudoVal = SrcMO->getPseudoValue();
567
auto DstPseudoVal = DstMO->getPseudoValue();
568
if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
569
SrcMO->getOffset() == DstMO->getOffset()) {
570
setBidirLatencies(ISU, Dep, latency);
571
return true;
572
} else if (SrcPseudoVal && DstPseudoVal &&
573
SrcPseudoVal->kind() == DstPseudoVal->kind() &&
574
SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
575
// Spills/fills
576
auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
577
auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
578
if (FS0 == FS1) {
579
setBidirLatencies(ISU, Dep, latency);
580
return true;
581
}
582
}
583
return false;
584
}
585
586
namespace {
587
588
// Opcode-property table queried through `II->` by the override classes in
// this file. It is created on first use by an override's constructor and is
// not rebuilt once it exists.
std::unique_ptr<InstructionInformation> II;
589
590
class CortexM7InstructionInformation : public InstructionInformation {
591
public:
592
CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
593
: InstructionInformation(TII) {}
594
};
595
596
class CortexM7Overrides : public ARMOverrideBypasses {
597
public:
598
CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
599
: ARMOverrideBypasses(TII, AA) {
600
if (!II)
601
II.reset(new CortexM7InstructionInformation(TII));
602
}
603
604
void modifyBypasses(SUnit &) override;
605
};
606
607
// Adjust the latencies of the edges leaving ISU for Cortex-M7. The
// adjustments are applied in order and later ones see the latency left by
// earlier ones.
void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *Producer = ISU.getInstr();
  const unsigned ProducerOpc = Producer->getOpcode();
  const bool ProducerIsWordLoad = II->isNonSubwordLoad(ProducerOpc);

  // Walk the successors looking for latency overrides that are needed.
  for (SDep &Dep : ISU.Succs) {
    // Output dependences should have 0 latency, as M7 is able to schedule
    // writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Everything below applies to data dependences only.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &ConsumerSU = *Dep.getSUnit();
    if (ConsumerSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *Consumer = ConsumerSU.getInstr();
    const unsigned ConsumerOpc = Consumer->getOpcode();

    if (ProducerIsWordLoad) {
      // Word loads feeding any multiply or divide instruction cannot bypass
      // their scheduling stage. This is not done in the .td file because a
      // read advance that is 0 from certain writer classes and 1 from all
      // the rest cannot easily be expressed there. (The other way around
      // would have been easy.)
      if (II->isMultiply(ConsumerOpc) || II->isDivide(ConsumerOpc))
        setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

      // Word loads feeding the B operand of a load/store cannot bypass
      // their scheduling stage either. This cannot be done in the .td file
      // because it requires choosing between -1 and -2 for ReadAdvance.
      if (II->hasBRegAddr(ConsumerOpc) &&
          Consumer->getOperand(2).getReg() == Dep.getReg())
        setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    // Multiplies into any address generation cannot bypass from EX3. Cannot
    // do in the .td file because need to decide between -1 and -2 for
    // ReadAdvance.
    if (II->isMultiply(ProducerOpc)) {
      // Bit N of the address mask selects operand N; start at operand 1.
      unsigned OpIdx = 1;
      for (unsigned Mask = II->getAddressOpMask(ConsumerOpc) >> 1; Mask != 0;
           Mask >>= 1, ++OpIdx) {
        if ((Mask & 1) == 0)
          continue;
        const auto &AddrOp = Consumer->getOperand(OpIdx);
        if (AddrOp.isReg() && AddrOp.getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up
    // looking like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*Producer) && Dep.isAssignedRegDep() &&
        (ProducerOpc == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*Producer),
                        TII->getPredicate(*Consumer)))) {
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      const bool FeedsShiftedOperandA =
          II->isInlineShiftALU(ConsumerOpc) &&
          Consumer->getOperand(3).getImm() &&
          Consumer->getOperand(1).getReg() == Dep.getReg();
      const unsigned Penalty = FeedsShiftedOperandA ? 2 : 1;
      // The penalised latency is capped at 3, but never lowers the edge.
      const unsigned Capped = std::min(3u, Dep.getLatency() + Penalty);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Capped));
    }

    // CC setter into conditional producer shouldn't have a latency of more
    // than 1 unless it's due to an implicit read. (All the "true" readers of
    // the condition code use an implicit read, and predicates use an
    // explicit.)
    const bool IsCPSRDep = Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR;
    if (IsCPSRDep && TII->isPredicated(*Consumer) &&
        !hasImplicitCPSRUse(Consumer))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter. The code
    // is slightly inexact as it doesn't attempt to ensure that the bypass is
    // to the shifter operands.
    if (!II->isRev(ProducerOpc))
      continue;
    if (II->isInlineShiftALU(ConsumerOpc))
      setBidirLatencies(ISU, Dep, 2);
    else if (II->isShift(ConsumerOpc))
      setBidirLatencies(ISU, Dep, 1);
  }
}
699
700
class M85InstructionInformation : public InstructionInformation {
701
public:
702
M85InstructionInformation(const ARMBaseInstrInfo *t)
703
: InstructionInformation(t) {
704
markDPProducersConsumers(t);
705
}
706
};
707
708
class M85Overrides : public ARMOverrideBypasses {
709
public:
710
M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
711
: ARMOverrideBypasses(t, a) {
712
if (!II)
713
II.reset(new M85InstructionInformation(t));
714
}
715
716
void modifyBypasses(SUnit &) override;
717
718
private:
719
unsigned computeBypassStage(const MCSchedClassDesc *SCD);
720
signed modifyMixedWidthFP(const MachineInstr *SrcMI,
721
const MachineInstr *DstMI, unsigned RegID,
722
const MCSchedClassDesc *SCD);
723
};
724
725
unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
726
auto SM = DAG->getSchedModel();
727
unsigned DefIdx = 0; // just look for the first output's timing
728
if (DefIdx < SCDesc->NumWriteLatencyEntries) {
729
// Lookup the definition's write latency in SubtargetInfo.
730
const MCWriteLatencyEntry *WLEntry =
731
SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
732
unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
733
if (Latency == 4)
734
return 2;
735
else if (Latency == 5)
736
return 3;
737
else if (Latency > 3)
738
return 3;
739
else
740
return Latency;
741
}
742
return 2;
743
}
744
745
// Latency changes for bypassing between FP registers of different sizes:
746
//
747
// Note that mixed DP/SP are unlikely because of the semantics
748
// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
749
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
750
const MachineInstr *DstMI,
751
unsigned RegID,
752
const MCSchedClassDesc *SCD) {
753
754
if (!II->producesSP(SrcMI->getOpcode()) &&
755
!II->producesDP(SrcMI->getOpcode()) &&
756
!II->producesQP(SrcMI->getOpcode()))
757
return 0;
758
759
if (Register::isVirtualRegister(RegID)) {
760
if (II->producesSP(SrcMI->getOpcode()) &&
761
II->consumesDP(DstMI->getOpcode())) {
762
for (auto &OP : SrcMI->operands())
763
if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
764
OP.getSubReg() == ARM::ssub_1)
765
return 5 - computeBypassStage(SCD);
766
} else if (II->producesSP(SrcMI->getOpcode()) &&
767
II->consumesQP(DstMI->getOpcode())) {
768
for (auto &OP : SrcMI->operands())
769
if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
770
(OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
771
return 5 - computeBypassStage(SCD) -
772
((OP.getSubReg() == ARM::ssub_2 ||
773
OP.getSubReg() == ARM::ssub_3)
774
? 1
775
: 0);
776
} else if (II->producesDP(SrcMI->getOpcode()) &&
777
II->consumesQP(DstMI->getOpcode())) {
778
for (auto &OP : SrcMI->operands())
779
if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
780
OP.getSubReg() == ARM::ssub_1)
781
return -1;
782
} else if (II->producesDP(SrcMI->getOpcode()) &&
783
II->consumesSP(DstMI->getOpcode())) {
784
for (auto &OP : DstMI->operands())
785
if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
786
OP.getSubReg() == ARM::ssub_1)
787
return 5 - computeBypassStage(SCD);
788
} else if (II->producesQP(SrcMI->getOpcode()) &&
789
II->consumesSP(DstMI->getOpcode())) {
790
for (auto &OP : DstMI->operands())
791
if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
792
(OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
793
return 5 - computeBypassStage(SCD) +
794
((OP.getSubReg() == ARM::ssub_2 ||
795
OP.getSubReg() == ARM::ssub_3)
796
? 1
797
: 0);
798
} else if (II->producesQP(SrcMI->getOpcode()) &&
799
II->consumesDP(DstMI->getOpcode())) {
800
for (auto &OP : DstMI->operands())
801
if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
802
OP.getSubReg() == ARM::ssub_1)
803
return 1;
804
}
805
} else if (Register::isPhysicalRegister(RegID)) {
806
// Note that when the producer is narrower, not all of the producers
807
// may be present in the scheduling graph; somewhere earlier in the
808
// compiler, an implicit def/use of the aliased full register gets
809
// added to the producer, and so only that producer is seen as *the*
810
// single producer. This behavior also has the unfortunate effect of
811
// serializing the producers in the compiler's view of things.
812
if (II->producesSP(SrcMI->getOpcode()) &&
813
II->consumesDP(DstMI->getOpcode())) {
814
for (auto &OP : SrcMI->operands())
815
if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
816
OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
817
(OP.getReg() == RegID ||
818
(OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
819
(OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
820
return 5 - computeBypassStage(SCD);
821
} else if (II->producesSP(SrcMI->getOpcode()) &&
822
II->consumesQP(DstMI->getOpcode())) {
823
for (auto &OP : SrcMI->operands())
824
if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
825
OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
826
(OP.getReg() == RegID ||
827
(OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
828
(OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
829
return 5 - computeBypassStage(SCD) -
830
(((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
831
} else if (II->producesDP(SrcMI->getOpcode()) &&
832
II->consumesQP(DstMI->getOpcode())) {
833
for (auto &OP : SrcMI->operands())
834
if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
835
OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
836
(OP.getReg() == RegID ||
837
(OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
838
return -1;
839
} else if (II->producesDP(SrcMI->getOpcode()) &&
840
II->consumesSP(DstMI->getOpcode())) {
841
if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
842
return 5 - computeBypassStage(SCD);
843
} else if (II->producesQP(SrcMI->getOpcode()) &&
844
II->consumesSP(DstMI->getOpcode())) {
845
if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
846
return 5 - computeBypassStage(SCD) +
847
(((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
848
} else if (II->producesQP(SrcMI->getOpcode()) &&
849
II->consumesDP(DstMI->getOpcode())) {
850
if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
851
return 1;
852
}
853
}
854
return 0;
855
}
856
857
// Adjust the latencies of the edges leaving ISU for Cortex-M85. The
// adjustments are applied in order and later ones see the latency left by
// earlier ones.
void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as CortexM85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into the shifted B operand of a load/store cannot bypass
    // their scheduling stage. Cannot do in the .td file because need to
    // decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads feeding an MVE vector instruction take one extra cycle.
    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    // Matched MVE integer MACs that feed operand 0 of the consumer get one
    // cycle less. The latency is unsigned, so clamp at zero rather than
    // letting a zero latency wrap around to a huge value.
    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg()) {
      const unsigned CurLatency = Dep.getLatency();
      setBidirLatencies(ISU, Dep, CurLatency ? CurLatency - 1 : 0);
    }

    // CC setter into conditional producer shouldn't have a latency of more
    // than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    // Mixed-width FP bypassing: apply the signed adjustment, never going
    // below zero.
    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));

    // REV into an inline shift+ALU or a shift: latency 1 in both cases.
    if (II->isRev(SrcOpcode) &&
        (II->isInlineShiftALU(DstOpcode) || II->isShift(DstOpcode)))
      setBidirLatencies(ISU, Dep, 1);
  }
}
924
925
// Add M55 specific overrides for latencies between instructions. Currently it:
926
// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
927
class CortexM55Overrides : public ARMOverrideBypasses {
928
public:
929
CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
930
: ARMOverrideBypasses(TII, AA) {}
931
932
void modifyBypasses(SUnit &SU) override {
933
MachineInstr *SrcMI = SU.getInstr();
934
if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
935
return;
936
937
for (SDep &Dep : SU.Succs) {
938
if (Dep.getKind() != SDep::Data)
939
continue;
940
SUnit &DepSU = *Dep.getSUnit();
941
if (DepSU.isBoundaryNode())
942
continue;
943
MachineInstr *DstMI = DepSU.getInstr();
944
945
if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
946
setBidirLatencies(SU, Dep, 3);
947
}
948
}
949
};
950
951
} // end anonymous namespace
952
953
// Run the subtarget's modifyBypasses hook over every non-boundary node of the
// DAG, and over the exit node when it carries an instruction.
void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;

  for (SUnit &Node : DAGInstrs->SUnits)
    if (!Node.isBoundaryNode())
      modifyBypasses(Node);

  SUnit &Exit = DAGInstrs->ExitSU;
  if (Exit.getInstr())
    modifyBypasses(Exit);
}
963
964
std::unique_ptr<ScheduleDAGMutation>
965
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
966
if (ST.isCortexM85())
967
return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
968
else if (ST.isCortexM7())
969
return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
970
else if (ST.isCortexM55())
971
return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
972
973
return nullptr;
974
}
975
976
} // end namespace llvm
977
978