CoCalc -- ARMLatencyMutations.cpp

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
²¹³⁷⁹⁹ views
1
//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file This file contains the ARM definition DAG scheduling mutations which
10
/// change inter-instruction latencies
11
//
12
//===----------------------------------------------------------------------===//
13

14
#include "ARMLatencyMutations.h"
15
#include "ARMSubtarget.h"
16
#include "Thumb2InstrInfo.h"
17
#include "llvm/Analysis/AliasAnalysis.h"
18
#include "llvm/CodeGen/ScheduleDAG.h"
19
#include "llvm/CodeGen/ScheduleDAGMutation.h"
20
#include "llvm/CodeGen/TargetInstrInfo.h"
21
#include <algorithm>
22
#include <array>
23
#include <initializer_list>
24
#include <memory>
25

26
namespace llvm {
27

28
namespace {
29

30
// Precompute information about opcodes to speed up pass
31

32
class InstructionInformation {
33
protected:
34
  struct IInfo {
35
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
36
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
37
    bool IsDivide : 1;         // Some form of integer divide
38
    bool IsInlineShiftALU : 1; // Inline shift+ALU
39
    bool IsMultiply : 1;       // Some form of integer multiply
40
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
41
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
42
    bool IsShift : 1;          // Shift operation
43
    bool IsRev : 1;            // REV operation
44
    bool ProducesQP : 1;       // Produces a vector register result
45
    bool ProducesDP : 1;       // Produces a double-precision register result
46
    bool ProducesSP : 1;       // Produces a single-precision register result
47
    bool ConsumesQP : 1;       // Consumes a vector register result
48
    bool ConsumesDP : 1;       // Consumes a double-precision register result
49
    bool ConsumesSP : 1;       // Consumes a single-precision register result
50
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
51
    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
52
    IInfo()
53
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
54
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
55
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
56
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
57
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
58
          MVEIntMACMatched(0), AddressOpMask(0) {}
59
  };
60
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
61
  IInfoArray Info;
62

63
public:
64
  // Always available information
65
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
66
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
67
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
68
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
69
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
70
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
71
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
72
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
73
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
74
  bool isShift(unsigned Op) { return Info[Op].IsShift; }
75

76
  // information available if markDPConsumers is called.
77
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
78
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
79
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
80
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
81
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
82
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
83

84
  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
85
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
86
  }
87

88
  InstructionInformation(const ARMBaseInstrInfo *TII);
89

90
protected:
91
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
92
};
93

94
/// Builds the opcode property table.
///
/// Every entry starts out with all flags cleared (see IInfo's constructor);
/// the opcode lists below then switch on the individual properties and
/// finally the address-operand masks are computed.  \p TII is not consulted
/// here.
InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  // Loads/stores whose B-side address operand is a register.
  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
  };
  for (unsigned Op : hasBRegAddrList)
    Info[Op].HasBRegAddr = true;

  // Subset of the above whose B-side register may also be shifted.
  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (unsigned Op : hasBRegAddrShiftList)
    Info[Op].HasBRegAddrShift = true;

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (unsigned Op : isInlineShiftALUList)
    Info[Op].IsInlineShiftALU = true;

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
      t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
      t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
      t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
      t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
  };
  for (unsigned Op : isMultiplyList)
    Info[Op].IsMultiply = true;

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
  };
  for (unsigned Op : isMVEIntMACList)
    Info[Op].IsMVEIntMAC = true;

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
      t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci,  tLDRr,    tLDRspi,
  };
  for (unsigned Op : isNonSubwordLoadList)
    Info[Op].IsNonSubwordLoad = true;

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (unsigned Op : isRevList)
    Info[Op].IsRev = true;

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
  };
  for (unsigned Op : isShiftList)
    Info[Op].IsShift = true;

  // Opcodes that receive address-operand mask 0x6 (operands 1 and 2).
  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,  t2LDRBi8,  t2LDRBpci,  t2LDRBs,
      t2LDRHi12,  t2LDRHi8,  t2LDRHpci,  t2LDRHs,
      t2LDRSBi12, t2LDRSBi8, t2LDRSBpci, t2LDRSBs,
      t2LDRSHi12, t2LDRSHi8, t2LDRSHpci, t2LDRSHs,
      t2LDRi12,   t2LDRi8,   t2LDRpci,   t2LDRs,
      tLDRBi,     tLDRBr,    tLDRHi,     tLDRHr,
      tLDRSB,     tLDRSH,    tLDRi,      tLDRpci,
      tLDRr,      tLDRspi,
      t2STRBi12,  t2STRBi8,  t2STRBs,
      t2STRHi12,  t2STRHi8,  t2STRHs,
      t2STRi12,   t2STRi8,   t2STRs,
      tSTRBi,     tSTRBr,    tSTRHi,     tSTRHr,
      tSTRi,      tSTRr,     tSTRspi,
      VLDRD,      VLDRH,     VLDRS,
      VSTRD,      VSTRH,     VSTRS,
      MVE_VLD20_16,      MVE_VLD20_32,      MVE_VLD20_8,
      MVE_VLD21_16,      MVE_VLD21_32,      MVE_VLD21_8,
      MVE_VLD40_16,      MVE_VLD40_32,      MVE_VLD40_8,
      MVE_VLD41_16,      MVE_VLD41_32,      MVE_VLD41_8,
      MVE_VLD42_16,      MVE_VLD42_32,      MVE_VLD42_8,
      MVE_VLD43_16,      MVE_VLD43_32,      MVE_VLD43_8,
      MVE_VLDRBS16,      MVE_VLDRBS16_rq,
      MVE_VLDRBS32,      MVE_VLDRBS32_rq,
      MVE_VLDRBU16,      MVE_VLDRBU16_rq,
      MVE_VLDRBU32,      MVE_VLDRBU32_rq,
      MVE_VLDRBU8,       MVE_VLDRBU8_rq,
      MVE_VLDRDU64_qi,   MVE_VLDRDU64_rq,   MVE_VLDRDU64_rq_u,
      MVE_VLDRHS32,      MVE_VLDRHS32_rq,   MVE_VLDRHS32_rq_u,
      MVE_VLDRHU16,      MVE_VLDRHU16_rq,   MVE_VLDRHU16_rq_u,
      MVE_VLDRHU32,      MVE_VLDRHU32_rq,   MVE_VLDRHU32_rq_u,
      MVE_VLDRWU32,      MVE_VLDRWU32_qi,
      MVE_VLDRWU32_rq,   MVE_VLDRWU32_rq_u,
      MVE_VST20_16,      MVE_VST20_32,      MVE_VST20_8,
      MVE_VST21_16,      MVE_VST21_32,      MVE_VST21_8,
      MVE_VST40_16,      MVE_VST40_32,      MVE_VST40_8,
      MVE_VST41_16,      MVE_VST41_32,      MVE_VST41_8,
      MVE_VST42_16,      MVE_VST42_32,      MVE_VST42_8,
      MVE_VST43_16,      MVE_VST43_32,      MVE_VST43_8,
      MVE_VSTRB16,       MVE_VSTRB16_rq,
      MVE_VSTRB32,       MVE_VSTRB32_rq,
      MVE_VSTRBU8,       MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,    MVE_VSTRD64_rq,    MVE_VSTRD64_rq_u,
      MVE_VSTRH32,       MVE_VSTRH32_rq,    MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,      MVE_VSTRH16_rq,    MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,      MVE_VSTRW32_qi,
      MVE_VSTRW32_rq,    MVE_VSTRW32_rq_u,
  };
  // Opcodes that receive address-operand mask 0xc (operands 2 and 3).
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,  t2LDRB_PRE,  t2LDRDi8,
      t2LDRH_POST,  t2LDRH_PRE,
      t2LDRSB_POST, t2LDRSB_PRE,
      t2LDRSH_POST, t2LDRSH_PRE,
      t2LDR_POST,   t2LDR_PRE,
      t2STRB_POST,  t2STRB_PRE,  t2STRDi8,
      t2STRH_POST,  t2STRH_PRE,
      t2STR_POST,   t2STR_PRE,
      MVE_VLD20_16_wb,     MVE_VLD20_32_wb,     MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,     MVE_VLD21_32_wb,     MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,     MVE_VLD40_32_wb,     MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,     MVE_VLD41_32_wb,     MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,     MVE_VLD42_32_wb,     MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,     MVE_VLD43_32_wb,     MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,   MVE_VLDRBS16_pre,
      MVE_VLDRBS32_post,   MVE_VLDRBS32_pre,
      MVE_VLDRBU16_post,   MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,   MVE_VLDRBU32_pre,
      MVE_VLDRBU8_post,    MVE_VLDRBU8_pre,
      MVE_VLDRDU64_qi_pre,
      MVE_VLDRHS32_post,   MVE_VLDRHS32_pre,
      MVE_VLDRHU16_post,   MVE_VLDRHU16_pre,
      MVE_VLDRHU32_post,   MVE_VLDRHU32_pre,
      MVE_VLDRWU32_post,   MVE_VLDRWU32_pre,
      MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,     MVE_VST20_32_wb,     MVE_VST20_8_wb,
      MVE_VST21_16_wb,     MVE_VST21_32_wb,     MVE_VST21_8_wb,
      MVE_VST40_16_wb,     MVE_VST40_32_wb,     MVE_VST40_8_wb,
      MVE_VST41_16_wb,     MVE_VST41_32_wb,     MVE_VST41_8_wb,
      MVE_VST42_16_wb,     MVE_VST42_32_wb,     MVE_VST42_8_wb,
      MVE_VST43_16_wb,     MVE_VST43_32_wb,     MVE_VST43_8_wb,
      MVE_VSTRB16_post,    MVE_VSTRB16_pre,
      MVE_VSTRB32_post,    MVE_VSTRB32_pre,
      MVE_VSTRBU8_post,    MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre,
      MVE_VSTRH32_post,    MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,   MVE_VSTRHU16_pre,
      MVE_VSTRWU32_post,   MVE_VSTRWU32_pre,
      MVE_VSTRW32_qi_pre,
  };
  // Opcodes that receive address-operand mask 0x18 (operands 3 and 4).
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST,
      t2LDRD_PRE,
      t2STRD_POST,
      t2STRD_PRE,
  };

  // Compute a mask of which operands are involved in address computation.
  // Bit N of the mask corresponds to operand N of the instruction.
  for (unsigned Op : Address1List)
    Info[Op].AddressOpMask = 0x6;
  for (unsigned Op : Address2List)
    Info[Op].AddressOpMask = 0xc;
  for (unsigned Op : Address3List)
    Info[Op].AddressOpMask = 0x18;
  // Must run after the plain assignments above: it adds operand 3 on top of
  // the mask already stored for the shifted-register forms.
  for (unsigned Op : hasBRegAddrShiftList)
    Info[Op].AddressOpMask |= 0x8;
}
426

427
void InstructionInformation::markDPProducersConsumers(
428
    const ARMBaseInstrInfo *TII) {
429
  // Learn about all instructions which have FP source/dest registers
430
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
431
    const MCInstrDesc &MID = TII->get(MI);
432
    auto Operands = MID.operands();
433
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
434
      bool MarkQP = false, MarkDP = false, MarkSP = false;
435
      switch (Operands[OI].RegClass) {
436
      case ARM::MQPRRegClassID:
437
      case ARM::DPRRegClassID:
438
      case ARM::DPR_8RegClassID:
439
      case ARM::DPR_VFP2RegClassID:
440
      case ARM::DPairRegClassID:
441
      case ARM::DPairSpcRegClassID:
442
      case ARM::DQuadRegClassID:
443
      case ARM::DQuadSpcRegClassID:
444
      case ARM::DTripleRegClassID:
445
      case ARM::DTripleSpcRegClassID:
446
        MarkDP = true;
447
        break;
448
      case ARM::QPRRegClassID:
449
      case ARM::QPR_8RegClassID:
450
      case ARM::QPR_VFP2RegClassID:
451
      case ARM::QQPRRegClassID:
452
      case ARM::QQQQPRRegClassID:
453
        MarkQP = true;
454
        break;
455
      case ARM::SPRRegClassID:
456
      case ARM::SPR_8RegClassID:
457
      case ARM::FPWithVPRRegClassID:
458
        MarkSP = true;
459
        break;
460
      default:
461
        break;
462
      }
463
      if (MarkQP) {
464
        if (OI < MID.getNumDefs())
465
          Info[MI].ProducesQP = true;
466
        else
467
          Info[MI].ConsumesQP = true;
468
      }
469
      if (MarkDP) {
470
        if (OI < MID.getNumDefs())
471
          Info[MI].ProducesDP = true;
472
        else
473
          Info[MI].ConsumesDP = true;
474
      }
475
      if (MarkSP) {
476
        if (OI < MID.getNumDefs())
477
          Info[MI].ProducesSP = true;
478
        else
479
          Info[MI].ConsumesSP = true;
480
      }
481
    }
482
  }
483
}
484

485
} // anonymous namespace
486

487
static bool hasImplicitCPSRUse(const MachineInstr *MI) {
488
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
489
}
490

491
void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
492
                                            unsigned latency) {
493
  SDep Reverse = SrcDep;
494
  Reverse.setSUnit(&SrcSU);
495
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
496
    if (PDep == Reverse) {
497
      PDep.setLatency(latency);
498
      SrcDep.getSUnit()->setDepthDirty();
499
      break;
500
    }
501
  }
502
  SrcDep.setLatency(latency);
503
  SrcSU.setHeightDirty();
504
}
505

506
/// Returns true when the two condition codes differ in any of bits 1..3,
/// i.e. when they are still different after bit 0 has been ignored.
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return ((a ^ b) & 0xe) != 0;
}
509

510
// Set output dependences to zero latency for processors which can
511
// simultaneously issue to the same register.  Returns true if a change
512
// was made.
513
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
514
  if (Dep.getKind() == SDep::Output) {
515
    setBidirLatencies(ISU, Dep, 0);
516
    return true;
517
  }
518
  return false;
519
}
520

521
// The graph doesn't look inside of bundles to determine their
522
// scheduling boundaries and reports zero latency into and out of them
523
// (except for CPSR into the bundle, which has latency 1).
524
// Make some better scheduling assumptions:
525
// 1) CPSR uses have zero latency; other uses have incoming latency 1
526
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
527
//
528
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
529
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
530

531
  SUnit &DepSU = *Dep.getSUnit();
532
  const MachineInstr *SrcMI = ISU.getInstr();
533
  unsigned SrcOpcode = SrcMI->getOpcode();
534
  const MachineInstr *DstMI = DepSU.getInstr();
535
  unsigned DstOpcode = DstMI->getOpcode();
536

537
  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
538
    setBidirLatencies(
539
        ISU, Dep,
540
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
541
    return 1;
542
  }
543
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
544
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
545
    setBidirLatencies(ISU, Dep, 1);
546
    return 2;
547
  }
548
  return 0;
549
}
550

551
// Determine whether there is a memory RAW hazard here and set up latency
552
// accordingly
553
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
554
                                          unsigned latency) {
555
  if (!Dep.isNormalMemory())
556
    return false;
557
  auto &SrcInst = *ISU.getInstr();
558
  auto &DstInst = *Dep.getSUnit()->getInstr();
559
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
560
    return false;
561

562
  auto SrcMO = *SrcInst.memoperands().begin();
563
  auto DstMO = *DstInst.memoperands().begin();
564
  auto SrcVal = SrcMO->getValue();
565
  auto DstVal = DstMO->getValue();
566
  auto SrcPseudoVal = SrcMO->getPseudoValue();
567
  auto DstPseudoVal = DstMO->getPseudoValue();
568
  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
569
      SrcMO->getOffset() == DstMO->getOffset()) {
570
    setBidirLatencies(ISU, Dep, latency);
571
    return true;
572
  } else if (SrcPseudoVal && DstPseudoVal &&
573
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
574
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
575
    // Spills/fills
576
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
577
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
578
    if (FS0 == FS1) {
579
      setBidirLatencies(ISU, Dep, latency);
580
      return true;
581
    }
582
  }
583
  return false;
584
}
585

586
namespace {
587

588
// Opcode property table shared by the override classes in this file.  It is
// created on first use by whichever override constructor finds it empty and
// is never replaced afterwards.
// NOTE(review): because the table is created only once, the first override
// class constructed decides which InstructionInformation subclass is used
// for the rest of the process -- confirm that at most one kind of override
// is ever instantiated.
std::unique_ptr<InstructionInformation> II;
589

590
class CortexM7InstructionInformation : public InstructionInformation {
591
public:
592
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
593
      : InstructionInformation(TII) {}
594
};
595

596
class CortexM7Overrides : public ARMOverrideBypasses {
597
public:
598
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
599
      : ARMOverrideBypasses(TII, AA) {
600
    if (!II)
601
      II.reset(new CortexM7InstructionInformation(TII));
602
  }
603

604
  void modifyBypasses(SUnit &) override;
605
};
606

607
/// Adjusts the latency of every successor dependence of \p ISU according to
/// the Cortex-M7 rules below.  The rules are applied in order and later
/// rules see the latency written by earlier ones.
void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *Producer = ISU.getInstr();
  const unsigned ProdOpc = Producer->getOpcode();

  // Producer properties do not change while walking the successors.
  const bool ProdIsWordLoad = II->isNonSubwordLoad(ProdOpc);
  const bool ProdIsMultiply = II->isMultiply(ProdOpc);
  const bool ProdIsRev = II->isRev(ProdOpc);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {
    // Output dependences should have 0 latency, as M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    // Store followed by a load from the same location.
    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &Succ = *Dep.getSUnit();
    if (Succ.isBoundaryNode())
      continue;

    // A dependence into a predicated bundle is fully handled by the helper.
    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *Consumer = Succ.getInstr();
    const unsigned ConsOpc = Consumer->getOpcode();

    if (ProdIsWordLoad) {
      // Word loads into any multiply or divide instruction are considered
      // unable to bypass their scheduling stage. This is not done in the .td
      // file because a read advance that is 0 from certain writer classes
      // and 1 from all the rest cannot easily be created there.
      // (The other way around would have been easy.)
      if (II->isMultiply(ConsOpc) || II->isDivide(ConsOpc))
        setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

      // Word loads into the B operand of a load/store are considered unable
      // to bypass their scheduling stage. Cannot be done in the .td file
      // because of the need to decide between -1 and -2 for ReadAdvance.
      if (II->hasBRegAddr(ConsOpc) &&
          Consumer->getOperand(2).getReg() == Dep.getReg())
        setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    // Multiplies into any address generation cannot bypass from EX3.  Cannot
    // be done in the .td file because of the need to decide between -1 and
    // -2 for ReadAdvance.
    if (ProdIsMultiply) {
      // Bit N of the mask stands for operand N; operand 0 is skipped.
      unsigned Idx = 1;
      for (unsigned Mask = II->getAddressOpMask(ConsOpc) >> 1; Mask != 0;
           Mask >>= 1, ++Idx) {
        if ((Mask & 1) == 0)
          continue;
        const auto &AddrOp = Consumer->getOperand(Idx);
        if (AddrOp.isReg() && AddrOp.getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*Producer) && Dep.isAssignedRegDep() &&
        (ProdOpc == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*Producer),
                        TII->getPredicate(*Consumer)))) {
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      unsigned Extra = 1;
      if (II->isInlineShiftALU(ConsOpc) && Consumer->getOperand(3).getImm() &&
          Consumer->getOperand(1).getReg() == Dep.getReg())
        Extra = 2;

      // Raise the latency by Extra, but never raise it beyond 3 and never
      // lower a latency that is already larger.
      const unsigned Current = Dep.getLatency();
      const unsigned Raised = std::min(3u, Current + Extra);
      setBidirLatencies(ISU, Dep, std::max(Current, Raised));
    }

    // CC setter into conditional producer shouldn't have a latency of more
    // than 1 unless it's due to an implicit read. (All the "true" readers
    // of the condition code use an implicit read, and predicates use an
    // explicit.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*Consumer) && !hasImplicitCPSRUse(Consumer))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter.  The
    // code is slightly inexact as it doesn't attempt to ensure that the bypass
    // is to the shifter operands.
    if (ProdIsRev) {
      if (II->isInlineShiftALU(ConsOpc))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(ConsOpc))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}
699

700
class M85InstructionInformation : public InstructionInformation {
701
public:
702
  M85InstructionInformation(const ARMBaseInstrInfo *t)
703
      : InstructionInformation(t) {
704
    markDPProducersConsumers(t);
705
  }
706
};
707

708
class M85Overrides : public ARMOverrideBypasses {
709
public:
710
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
711
      : ARMOverrideBypasses(t, a) {
712
    if (!II)
713
      II.reset(new M85InstructionInformation(t));
714
  }
715

716
  void modifyBypasses(SUnit &) override;
717

718
private:
719
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
720
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
721
                            const MachineInstr *DstMI, unsigned RegID,
722
                            const MCSchedClassDesc *SCD);
723
};
724

725
unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
726
  auto SM = DAG->getSchedModel();
727
  unsigned DefIdx = 0; // just look for the first output's timing
728
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
729
    // Lookup the definition's write latency in SubtargetInfo.
730
    const MCWriteLatencyEntry *WLEntry =
731
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
732
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
733
    if (Latency == 4)
734
      return 2;
735
    else if (Latency == 5)
736
      return 3;
737
    else if (Latency > 3)
738
      return 3;
739
    else
740
      return Latency;
741
  }
742
  return 2;
743
}
744

745
// Latency changes for bypassing between FP registers of different sizes:
746
//
747
// Note that mixed DP/SP are unlikely because of the semantics
748
// of C.  Mixed MVE/SP are quite common when MVE intrinsics are used.
749
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
750
                                        const MachineInstr *DstMI,
751
                                        unsigned RegID,
752
                                        const MCSchedClassDesc *SCD) {
753

754
  if (!II->producesSP(SrcMI->getOpcode()) &&
755
      !II->producesDP(SrcMI->getOpcode()) &&
756
      !II->producesQP(SrcMI->getOpcode()))
757
    return 0;
758

759
  if (Register::isVirtualRegister(RegID)) {
760
    if (II->producesSP(SrcMI->getOpcode()) &&
761
        II->consumesDP(DstMI->getOpcode())) {
762
      for (auto &OP : SrcMI->operands())
763
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
764
            OP.getSubReg() == ARM::ssub_1)
765
          return 5 - computeBypassStage(SCD);
766
    } else if (II->producesSP(SrcMI->getOpcode()) &&
767
               II->consumesQP(DstMI->getOpcode())) {
768
      for (auto &OP : SrcMI->operands())
769
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
770
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
771
          return 5 - computeBypassStage(SCD) -
772
                 ((OP.getSubReg() == ARM::ssub_2 ||
773
                   OP.getSubReg() == ARM::ssub_3)
774
                      ? 1
775
                      : 0);
776
    } else if (II->producesDP(SrcMI->getOpcode()) &&
777
               II->consumesQP(DstMI->getOpcode())) {
778
      for (auto &OP : SrcMI->operands())
779
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
780
            OP.getSubReg() == ARM::ssub_1)
781
          return -1;
782
    } else if (II->producesDP(SrcMI->getOpcode()) &&
783
               II->consumesSP(DstMI->getOpcode())) {
784
      for (auto &OP : DstMI->operands())
785
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
786
            OP.getSubReg() == ARM::ssub_1)
787
          return 5 - computeBypassStage(SCD);
788
    } else if (II->producesQP(SrcMI->getOpcode()) &&
789
               II->consumesSP(DstMI->getOpcode())) {
790
      for (auto &OP : DstMI->operands())
791
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
792
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
793
          return 5 - computeBypassStage(SCD) +
794
                 ((OP.getSubReg() == ARM::ssub_2 ||
795
                   OP.getSubReg() == ARM::ssub_3)
796
                      ? 1
797
                      : 0);
798
    } else if (II->producesQP(SrcMI->getOpcode()) &&
799
               II->consumesDP(DstMI->getOpcode())) {
800
      for (auto &OP : DstMI->operands())
801
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
802
            OP.getSubReg() == ARM::ssub_1)
803
          return 1;
804
    }
805
  } else if (Register::isPhysicalRegister(RegID)) {
806
    // Note that when the producer is narrower, not all of the producers
807
    // may be present in the scheduling graph; somewhere earlier in the
808
    // compiler, an implicit def/use of the aliased full register gets
809
    // added to the producer, and so only that producer is seen as *the*
810
    // single producer.  This behavior also has the unfortunate effect of
811
    // serializing the producers in the compiler's view of things.
812
    if (II->producesSP(SrcMI->getOpcode()) &&
813
        II->consumesDP(DstMI->getOpcode())) {
814
      for (auto &OP : SrcMI->operands())
815
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
816
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
817
            (OP.getReg() == RegID ||
818
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
819
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
820
          return 5 - computeBypassStage(SCD);
821
    } else if (II->producesSP(SrcMI->getOpcode()) &&
822
               II->consumesQP(DstMI->getOpcode())) {
823
      for (auto &OP : SrcMI->operands())
824
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
825
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
826
            (OP.getReg() == RegID ||
827
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
828
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
829
          return 5 - computeBypassStage(SCD) -
830
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
831
    } else if (II->producesDP(SrcMI->getOpcode()) &&
832
               II->consumesQP(DstMI->getOpcode())) {
833
      for (auto &OP : SrcMI->operands())
834
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
835
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
836
            (OP.getReg() == RegID ||
837
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
838
          return -1;
839
    } else if (II->producesDP(SrcMI->getOpcode()) &&
840
               II->consumesSP(DstMI->getOpcode())) {
841
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
842
        return 5 - computeBypassStage(SCD);
843
    } else if (II->producesQP(SrcMI->getOpcode()) &&
844
               II->consumesSP(DstMI->getOpcode())) {
845
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
846
        return 5 - computeBypassStage(SCD) +
847
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
848
    } else if (II->producesQP(SrcMI->getOpcode()) &&
849
               II->consumesDP(DstMI->getOpcode())) {
850
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
851
        return 1;
852
    }
853
  }
854
  return 0;
855
}
856

857
/// Apply the Cortex-M85 latency overrides to the successor edges of \p ISU.
///
/// Each successor edge is first offered to zeroOutputDependences() and then to
/// memoryRAWHazard(); an edge that either helper accepts is not processed any
/// further. Of the remaining edges, only data dependences whose consumer is a
/// non-boundary node, and for which makeBundleAssumptions() does not return 1,
/// receive the adjustments below. The adjustments are applied in order, and
/// each one reads the latency left behind by the previous one.
///
/// \param ISU the producing scheduling unit whose outgoing edges are updated.
void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as CortexM85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    // NOTE(review): presumably 3 is the latency applied to a memory
    // read-after-write hazard -- confirm against memoryRAWHazard().
    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Only data dependences are adjusted below.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word (or larger) loads feeding the shifted B operand of a load/store
    // address cannot bypass their scheduling stage. Cannot do this in the .td
    // file because we need to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // A word (or larger) load feeding an MVE vector instruction costs one
    // extra cycle.
    if (isNSWload && isMVEVectorInstruction(DstMI))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // A matched pair of MVE integer MACs whose dependence is carried through
    // the consumer's operand 0 is one cycle shorter.
    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg()) {
      // The latency is unsigned, so guard the decrement: a zero-latency edge
      // must stay at zero rather than wrap around to a huge value.
      const unsigned CurLatency = Dep.getLatency();
      setBidirLatencies(ISU, Dep, CurLatency ? CurLatency - 1 : 0);
    }

    // CC setter into conditional producer shouldn't have a latency of more
    // than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    // Mixed-width FP producer/consumer pairs get a signed latency delta; the
    // result is clamped so that it never drops below zero.
    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));

    // A REV feeding either an inline shift+ALU or a plain shift is given a
    // fixed latency of 1.
    if (II->isRev(SrcOpcode) &&
        (II->isInlineShiftALU(DstOpcode) || II->isShift(DstOpcode)))
      setBidirLatencies(ISU, Dep, 1);
  }
}
924

925
// Add M55 specific overrides for latencies between instructions. Currently it:
926
//  - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
927
class CortexM55Overrides : public ARMOverrideBypasses {
928
public:
929
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
930
      : ARMOverrideBypasses(TII, AA) {}
931

932
  void modifyBypasses(SUnit &SU) override {
933
    MachineInstr *SrcMI = SU.getInstr();
934
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
935
      return;
936

937
    for (SDep &Dep : SU.Succs) {
938
      if (Dep.getKind() != SDep::Data)
939
        continue;
940
      SUnit &DepSU = *Dep.getSUnit();
941
      if (DepSU.isBoundaryNode())
942
        continue;
943
      MachineInstr *DstMI = DepSU.getInstr();
944

945
      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
946
        setBidirLatencies(SU, Dep, 3);
947
    }
948
  }
949
};
950

951
} // end anonymous namespace
952

953
/// Mutation entry point: record the DAG being processed in the DAG member and
/// run the modifyBypasses() hook on each of its scheduling units.
///
/// \param DAGInstrs the scheduling DAG whose edges may be rewritten.
void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;

  // Visit every non-boundary node in the DAG.
  for (SUnit &Node : DAGInstrs->SUnits)
    if (!Node.isBoundaryNode())
      modifyBypasses(Node);

  // The exit node is visited as well, but only when it carries an
  // instruction.
  SUnit &Exit = DAGInstrs->ExitSU;
  if (Exit.getInstr())
    modifyBypasses(Exit);
}
963

964
std::unique_ptr<ScheduleDAGMutation>
965
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
966
  if (ST.isCortexM85())
967
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
968
  else if (ST.isCortexM7())
969
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
970
  else if (ST.isCortexM55())
971
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
972

973
  return nullptr;
974
}
975

976
} // end namespace llvm
977

978
Product

Resources

Company