Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
//===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
  if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
      isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

/// Match an fpext from half to float, or a constant we can convert.
static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
  if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
    return FPExtSrc->getType()->isHalfTy();

  ConstantFP *CFP;
  if (match(Arg, m_ConstantFP(CFP))) {
    bool LosesInfo;
    APFloat Val(CFP->getValueAPF());
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (LosesInfo)
      return false;

    FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
    return true;
  }

  return false;
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with known elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
                                       Instruction *I) {
  auto *VTy = cast<FixedVectorType>(UseV->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);

  for (int i = VWidth - 1; i > 0; --i) {
    auto *Elt = findScalarElement(UseV, i);
    if (!Elt)
      break;

    if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
      if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
        break;
    } else {
      break;
    }

    DemandedElts.clearBit(i);
  }

  return DemandedElts;
}

// Trim elements of the end of the vector \p V, if they are
// equal to the first element of the vector.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  Value *FirstComponent = findScalarElement(V, 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
    SVI->getShuffleMask(ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      auto *Elt = findScalarElement(V, I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(I);
  }

  return DemandedElts;
}

static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);

/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
  return (SqrtOp->getType()->isFloatTy() &&
          (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
         SqrtOp->getType()->isHalfTy();
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
    if (!FMF.allowContract())
      break;
    auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
    if (!SrcCI)
      break;

    auto IID = SrcCI->getIntrinsicID();
    // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
    //
    // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
    // relaxed.
    if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
      const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
      FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
      if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
        break;

      if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
        break;

      Function *NewDecl = Intrinsic::getDeclaration(
          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});

      InnerFMF |= FMF;
      II.setFastMathFlags(InnerFMF);

      II.setCalledFunction(NewDecl);
      return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // f16 amdgcn.sqrt is identical to regular sqrt.
    if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
      Function *NewDecl = Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::sqrt, {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2: {
    const bool IsLog = IID == Intrinsic::amdgcn_log;
    const bool IsExp = IID == Intrinsic::amdgcn_exp2;
    Value *Src = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    if (IC.getSimplifyQuery().isUndefValue(Src))
      return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

    if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      if (C->isInfinity()) {
        // exp2(+inf) -> +inf
        // log2(+inf) -> +inf
        if (!C->isNegative())
          return IC.replaceInstUsesWith(II, C);

        // exp2(-inf) -> 0
        if (IsExp && C->isNegative())
          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
      }

      if (II.isStrictFP())
        break;

      if (C->isNaN()) {
        Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
        return IC.replaceInstUsesWith(II, Quieted);
      }

      // f32 instruction doesn't handle denormals, f16 does.
      if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
        Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
                                      : ConstantFP::get(Ty, 1.0);
        return IC.replaceInstUsesWith(II, FoldedValue);
      }

      if (IsLog && C->isNegative())
        return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

      // TODO: Full constant folding matching hardware behavior.
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (CMask) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::is_fpclass, Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(1, ConstantInt::get(Src1->getType(),
                                           CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(Src1))
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          Src1, ConstantInt::getNullValue(Src1->getType()));
      return IC.replaceInstUsesWith(II, CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    Value *V = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      V = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      V = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      V = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (V) {
      if (auto *CI = dyn_cast<CallInst>(V)) {
        CI->copyFastMathFlags(&II);
        CI->takeName(&II);
      }
      return IC.replaceInstUsesWith(II, V);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    if (!ST->hasMed3_16())
      break;

    Value *X, *Y, *Z;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
        matchFPExtFromF16(Src2, Z)) {
      Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
                                                  {X, Y, Z}, &II, II.getName());
      return new FPExtInst(NewCall, II.getType());
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, IC.Builder.CreateSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_mbcnt_hi: {
    // exec_hi is all 0, so this is just a copy.
    if (ST->isWave32())
      return IC.replaceInstUsesWith(II, II.getArgOperand(1));
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }
    }
    if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
      // %b64 = call i64 ballot.i64(...)
      // =>
      // %b32 = call i32 ballot.i32(...)
      // %b64 = zext i32 %b32 to i64
      Value *Call = IC.Builder.CreateZExt(
          IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
                                     {IC.Builder.getInt32Ty()},
                                     {II.getArgOperand(0)}),
          II.getType());
      Call->takeName(&II);
      return IC.replaceInstUsesWith(II, Call);
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4 /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5
    // For Permlane16_var and permlanex16_var it should be 4
    unsigned int BcIdx = FiIdx + 1;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_trig_preop: {
    // The intrinsic is declared with name mangling, but currently the
    // instruction only exists for f64
    if (!II.getType()->isDoubleTy())
      break;

    Value *Src = II.getArgOperand(0);
    Value *Segment = II.getArgOperand(1);
    if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    if (isa<UndefValue>(Src)) {
      auto *QNaN = ConstantFP::get(
          II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
    if (!Csrc)
      break;

    if (II.isStrictFP())
      break;

    const APFloat &Fsrc = Csrc->getValueAPF();
    if (Fsrc.isNaN()) {
      auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
      return IC.replaceInstUsesWith(II, Quieted);
    }

    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
    if (!Cseg)
      break;

    unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
    unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
    unsigned Shift = SegmentVal * 53;
    if (Exponent > 1077)
      Shift += Exponent - 1077;

    // 2.0/PI table.
    static const uint32_t TwoByPi[] = {
        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
        0x56033046};

    // Return 0 for outbound segment (hardware behavior).
    unsigned Idx = Shift >> 5;
    if (Idx + 2 >= std::size(TwoByPi)) {
      APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
    }

    unsigned BShift = Shift & 0x1f;
    uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
    uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
    if (BShift)
      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
    Thi = Thi >> 11;
    APFloat Result = APFloat((double)Thi);

    int Scale = -53 - Shift;
    if (Exponent >= 1968)
      Scale += 128;

    Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getZero(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
      break;

    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
    else
      break;

    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              false)) {
      return IC.eraseInstFromFunction(II);
    }

    break;
  }
  }
  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// The result of simplifying amdgcn image and buffer store intrinsics is updating
/// definitions of the intrinsics vector argument, not Uses of the result like
/// image and buffer loads.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
                                             : II.getOperand(0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.popcount();
  if (!NewNumElts)
    return PoisonValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
  }

  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (IsLoad) {
    if (NewNumElts == 1) {
      return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
                                            DemandedElts.countr_zero());
    }

    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(NewLoadIdx++);
      else
        EltMask.push_back(NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

    return Shuffle;
  }

  return NewCall;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}