Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
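
// Illustrative example (assumed MIR, not from this file): for G_FADD the
// negate distributes over both operands,
//   %b:_(s32) = G_FNEG (G_FADD %x, %y)
//     -->
//   %b:_(s32) = G_FADD (G_FNEG %x), (G_FNEG %y)
// and each inner G_FNEG can then be folded away as a VOP3 source modifier.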

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // a source modifier is truly free for them. For users that would not
  // otherwise need a VOP3 encoding, adding a source modifier forces the larger
  // encoding and may increase code size. Try to avoid increasing code size
  // unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

// Bit patterns of 1.0 / (2.0 * pi) in half, single, and double precision.
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// Negating +0.0 or +1.0 / (2.0 * pi) yields a constant (-0.0 or the negated
// inv2pi value) that has no inline immediate, so there is an additional cost
// to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}
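
// Swapping a min/max opcode for its dual is what makes fneg foldable through
// min/max, via the identity
//   fneg(fmin(x, y)) = fmax(fneg(x), fneg(y))
// and symmetrically for fmax.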
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot take source modifiers, give up. This both prevents
  // unprofitable transformations and infinite loops: we won't repeatedly try
  // to fold around a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
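
// Concrete single-use example (assumed MIR): matchFoldableFneg on
//   %a:_(s32) = G_FMUL %x, %y
//   %b:_(s32) = G_FNEG %a
// lets applyFoldableFneg below rewrite the pair into
//   %a:_(s32) = G_FMUL %x, (G_FNEG %y)
// with all uses of %b replaced by the now-negated %a.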

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace the register in Op with a register holding the negated value; an
  // operand that is already an fneg cancels instead of stacking a second one.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Negate one of the two operands, preferring one that is already an fneg so
  // the two negations cancel.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate the appropriate operands so that the resulting value of MatchInfo
  // is negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }
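
  // E.g. (assumed MIR) in the multi-use case, with a second use %u of %a:
  //   %a:_(s32) = G_FADD %x, %y    ; MatchInfo
  //   %b:_(s32) = G_FNEG %a        ; MI
  //   %u:_(s32) = G_FMUL %a, %z
  // becomes
  //   %neg:_(s32) = G_FADD (G_FNEG %x), (G_FNEG %y)
  //   %a:_(s32)   = G_FNEG %neg    ; recreated for %u
  // with all uses of %b rewired to %neg.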
  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice
    // versa, but replaceRegWith will replace defs as well. It is easier to
    // replace one def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  // Expand the median as med3(x, y, z) = min(max(x, y), max(min(x, y), z)).
  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}