GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
1
//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// \file
10
// This file implements the AMDGPU-specific InstCombine hooks exposed through
11
// TargetTransformInfo. It uses the target's detailed information to fold and
12
// simplify calls to AMDGPU intrinsics, while letting the target-independent
13
// InstCombine transforms handle everything else.
14
//
15
//===----------------------------------------------------------------------===//
16
17
#include "AMDGPUInstrInfo.h"
18
#include "AMDGPUTargetTransformInfo.h"
19
#include "GCNSubtarget.h"
20
#include "llvm/ADT/FloatingPointMode.h"
21
#include "llvm/IR/IntrinsicsAMDGPU.h"
22
#include "llvm/Transforms/InstCombine/InstCombiner.h"
23
#include <optional>
24
25
using namespace llvm;
26
using namespace llvm::PatternMatch;
27
28
#define DEBUG_TYPE "AMDGPUtti"
29
30
namespace {
31
32
struct AMDGPUImageDMaskIntrinsic {
33
unsigned Intr;
34
};
35
36
#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37
#include "InstCombineTables.inc"
38
39
} // end anonymous namespace
40
41
// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42
//
43
// A single NaN input is folded to minnum, so we rely on that folding for
44
// handling NaNs.
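// For example, fmed3AMDGCN(1.0, 5.0, 3.0): Max3 is 5.0, which compares equal
// to Src1, so the result is maxnum(Src0, Src2) = 3.0, the median of the three.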
45
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46
const APFloat &Src2) {
47
APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
48
49
APFloat::cmpResult Cmp0 = Max3.compare(Src0);
50
assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51
if (Cmp0 == APFloat::cmpEqual)
52
return maxnum(Src1, Src2);
53
54
APFloat::cmpResult Cmp1 = Max3.compare(Src1);
55
assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56
if (Cmp1 == APFloat::cmpEqual)
57
return maxnum(Src0, Src2);
58
59
return maxnum(Src0, Src1);
60
}
61
62
// Check if a value can be converted to a 16-bit value without losing
63
// precision.
64
// The value is expected to be either a float (IsFloat = true) or an unsigned
65
// integer (IsFloat = false).
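// For example, a float constant 42.0 or an i32 constant 1000 can be narrowed,
// as can values produced by fpext/zext from half/i16; a value that is already
// half or i16 is rejected because no conversion is needed.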
66
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67
Type *VTy = V.getType();
68
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
69
// The value is already 16-bit, so we don't want to convert to 16-bit again!
70
return false;
71
}
72
if (IsFloat) {
73
if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
74
// We need to check that if we cast the index down to a half, we do not
75
// lose precision.
76
APFloat FloatValue(ConstFloat->getValueAPF());
77
bool LosesInfo = true;
78
FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
79
&LosesInfo);
80
return !LosesInfo;
81
}
82
} else {
83
if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
84
// We need to check that if we cast the index down to an i16, we do not
85
// lose precision.
86
APInt IntValue(ConstInt->getValue());
87
return IntValue.getActiveBits() <= 16;
88
}
89
}
90
91
Value *CastSrc;
92
bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
93
: match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
94
if (IsExt) {
95
Type *CastSrcTy = CastSrc->getType();
96
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
97
return true;
98
}
99
100
return false;
101
}
102
103
// Convert a value to 16-bit.
104
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105
Type *VTy = V.getType();
106
if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
107
return cast<Instruction>(&V)->getOperand(0);
108
if (VTy->isIntegerTy())
109
return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
110
if (VTy->isFloatingPointTy())
111
return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
112
113
llvm_unreachable("Should never be called!");
114
}
115
116
/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117
/// modified arguments (based on OldIntr) and replaces InstToReplace with
118
/// this newly created intrinsic call.
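/// For example, the _L-to-_LZ fold below passes a Func that simply erases the
/// lod argument from Args before the new call is created.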
119
static std::optional<Instruction *> modifyIntrinsicCall(
120
IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121
InstCombiner &IC,
122
std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123
Func) {
124
SmallVector<Type *, 4> ArgTys;
125
if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
126
return std::nullopt;
127
128
SmallVector<Value *, 8> Args(OldIntr.args());
129
130
// Modify arguments and types
131
Func(Args, ArgTys);
132
133
Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
134
135
CallInst *NewCall = IC.Builder.CreateCall(I, Args);
136
NewCall->takeName(&OldIntr);
137
NewCall->copyMetadata(OldIntr);
138
if (isa<FPMathOperator>(NewCall))
139
NewCall->copyFastMathFlags(&OldIntr);
140
141
// Erase and replace uses
142
if (!InstToReplace.getType()->isVoidTy())
143
IC.replaceInstUsesWith(InstToReplace, NewCall);
144
145
bool RemoveOldIntr = &OldIntr != &InstToReplace;
146
147
auto RetValue = IC.eraseInstFromFunction(InstToReplace);
148
if (RemoveOldIntr)
149
IC.eraseInstFromFunction(OldIntr);
150
151
return RetValue;
152
}
153
154
static std::optional<Instruction *>
155
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157
IntrinsicInst &II, InstCombiner &IC) {
158
// Optimize _L to _LZ when _L is zero
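// Illustrative example (operands and type mangling elided):
//   call @llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
// becomes the LZ variant with the lod operand dropped:
//   call @llvm.amdgcn.image.sample.lz.2d(...)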
159
if (const auto *LZMappingInfo =
160
AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
161
if (auto *ConstantLod =
162
dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
163
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165
AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
166
ImageDimIntr->Dim);
167
return modifyIntrinsicCall(
168
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
169
Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170
});
171
}
172
}
173
}
174
175
// Optimize _mip away, when 'lod' is zero
176
if (const auto *MIPMappingInfo =
177
AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
178
if (auto *ConstantMip =
179
dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
180
if (ConstantMip->isZero()) {
181
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182
AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
183
ImageDimIntr->Dim);
184
return modifyIntrinsicCall(
185
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
186
Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187
});
188
}
189
}
190
}
191
192
// Optimize _bias away when 'bias' is zero
193
if (const auto *BiasMappingInfo =
194
AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
195
if (auto *ConstantBias =
196
dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
197
if (ConstantBias->isZero()) {
198
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199
AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
200
ImageDimIntr->Dim);
201
return modifyIntrinsicCall(
202
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
203
Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204
ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205
});
206
}
207
}
208
}
209
210
// Optimize _offset away when 'offset' is zero
211
if (const auto *OffsetMappingInfo =
212
AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
213
if (auto *ConstantOffset =
214
dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
215
if (ConstantOffset->isZero()) {
216
const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217
AMDGPU::getImageDimIntrinsicByBaseOpcode(
218
OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
219
return modifyIntrinsicCall(
220
II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
221
Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222
});
223
}
224
}
225
}
226
227
// Try to use D16
228
if (ST->hasD16Images()) {
229
230
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
232
233
if (BaseOpcode->HasD16) {
234
235
// If the only use of the image intrinsic is an fptrunc to half, then both the
236
// fptrunc and the image intrinsic are replaced by the same image intrinsic
237
// with the D16 flag, returning half directly.
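// Illustrative example (operands and type mangling elided):
//   %v = call <4 x float> @llvm.amdgcn.image.sample.2d(...)
//   %h = fptrunc <4 x float> %v to <4 x half>
// becomes a single call that returns <4 x half> directly.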
238
if (II.hasOneUse()) {
239
Instruction *User = II.user_back();
240
241
if (User->getOpcode() == Instruction::FPTrunc &&
242
User->getType()->getScalarType()->isHalfTy()) {
243
244
return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
245
[&](auto &Args, auto &ArgTys) {
246
// Change return type of image intrinsic.
247
// Set it to return type of fptrunc.
248
ArgTys[0] = User->getType();
249
});
250
}
251
}
252
}
253
}
254
255
// Try to use A16 or G16
256
if (!ST->hasA16() && !ST->hasG16())
257
return std::nullopt;
258
259
// Address is interpreted as float if the instruction has a sampler or as
260
// unsigned int if there is no sampler.
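// For example, coordinates produced by fpext from half can be rewritten into
// the 16-bit address (A16) form; if only the gradients are narrowable, the
// gradient-only (G16) form is used instead, when the subtarget supports it.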
261
bool HasSampler =
262
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
263
bool FloatCoord = false;
264
// true means derivatives can be converted to 16 bit, coordinates not
265
bool OnlyDerivatives = false;
266
267
for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269
Value *Coord = II.getOperand(OperandIndex);
270
// If the values are not derived from 16-bit values, we cannot optimize.
271
if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
272
if (OperandIndex < ImageDimIntr->CoordStart ||
273
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274
return std::nullopt;
275
}
276
// All gradients can be converted, so convert only them
277
OnlyDerivatives = true;
278
break;
279
}
280
281
assert(OperandIndex == ImageDimIntr->GradientStart ||
282
FloatCoord == Coord->getType()->isFloatingPointTy());
283
FloatCoord = Coord->getType()->isFloatingPointTy();
284
}
285
286
if (!OnlyDerivatives && !ST->hasA16())
287
OnlyDerivatives = true; // Only supports G16
288
289
// Check if there is a bias parameter and if it can be converted to f16
290
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
292
assert(HasSampler &&
293
"Only image instructions with a sampler can have a bias");
294
if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
295
OnlyDerivatives = true;
296
}
297
298
if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299
ImageDimIntr->CoordStart))
300
return std::nullopt;
301
302
Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
303
: Type::getInt16Ty(II.getContext());
304
305
return modifyIntrinsicCall(
306
II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
307
ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308
if (!OnlyDerivatives) {
309
ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310
311
// Change the bias type
312
if (ImageDimIntr->NumBiasArgs != 0)
313
ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
314
}
315
316
unsigned EndIndex =
317
OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318
for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319
OperandIndex < EndIndex; OperandIndex++) {
320
Args[OperandIndex] =
321
convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
322
}
323
324
// Convert the bias
325
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
327
Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
328
}
329
});
330
}
331
332
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333
const Value *Op0, const Value *Op1,
334
InstCombiner &IC) const {
335
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336
// infinity, gives +0.0. If we can prove we don't have one of the special
337
// cases then we can use a normal multiply instead.
338
// TODO: Create and use isKnownFiniteNonZero instead of just matching
339
// constants here.
340
if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
341
match(Op1, PatternMatch::m_FiniteNonZero())) {
342
// One operand is not zero or infinity or NaN.
343
return true;
344
}
345
346
SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
347
if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
348
isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
349
// Neither operand is infinity or NaN.
350
return true;
351
}
352
return false;
353
}
354
355
/// Match an fpext from half to float, or a constant we can convert.
356
static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
357
if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
358
return FPExtSrc->getType()->isHalfTy();
359
360
ConstantFP *CFP;
361
if (match(Arg, m_ConstantFP(CFP))) {
362
bool LosesInfo;
363
APFloat Val(CFP->getValueAPF());
364
Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
365
if (LosesInfo)
366
return false;
367
368
FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
369
return true;
370
}
371
372
return false;
373
}
374
375
// Trim all zero components from the end of the vector \p UseV and return
376
// an appropriate bitset with known elements.
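// For example, a stored value <float %x, float %y, float 0.0, float 0.0>
// yields the mask 0b0011: the two trailing zero components are not demanded.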
377
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
378
Instruction *I) {
379
auto *VTy = cast<FixedVectorType>(UseV->getType());
380
unsigned VWidth = VTy->getNumElements();
381
APInt DemandedElts = APInt::getAllOnes(VWidth);
382
383
for (int i = VWidth - 1; i > 0; --i) {
384
auto *Elt = findScalarElement(UseV, i);
385
if (!Elt)
386
break;
387
388
if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
389
if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
390
break;
391
} else {
392
break;
393
}
394
395
DemandedElts.clearBit(i);
396
}
397
398
return DemandedElts;
399
}
400
401
// Trim elements from the end of the vector \p V if they are equal to the
402
// first element of the vector.
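// For example, <float %x, float %y, float %x, float %x> yields the mask
// 0b0011: the trailing components repeat the first one and are not demanded.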
403
static APInt defaultComponentBroadcast(Value *V) {
404
auto *VTy = cast<FixedVectorType>(V->getType());
405
unsigned VWidth = VTy->getNumElements();
406
APInt DemandedElts = APInt::getAllOnes(VWidth);
407
Value *FirstComponent = findScalarElement(V, 0);
408
409
SmallVector<int> ShuffleMask;
410
if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
411
SVI->getShuffleMask(ShuffleMask);
412
413
for (int I = VWidth - 1; I > 0; --I) {
414
if (ShuffleMask.empty()) {
415
auto *Elt = findScalarElement(V, I);
416
if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
417
break;
418
} else {
419
// Detect identical elements in the shufflevector result, even though
420
// findScalarElement cannot tell us what that element is.
421
if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
422
break;
423
}
424
DemandedElts.clearBit(I);
425
}
426
427
return DemandedElts;
428
}
429
430
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
431
IntrinsicInst &II,
432
APInt DemandedElts,
433
int DMaskIdx = -1,
434
bool IsLoad = true);
435
436
/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
437
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
438
return (SqrtOp->getType()->isFloatTy() &&
439
(SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
440
SqrtOp->getType()->isHalfTy();
441
}
442
443
std::optional<Instruction *>
444
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
445
Intrinsic::ID IID = II.getIntrinsicID();
446
switch (IID) {
447
case Intrinsic::amdgcn_rcp: {
448
Value *Src = II.getArgOperand(0);
449
450
// TODO: Move to ConstantFolding/InstSimplify?
451
if (isa<UndefValue>(Src)) {
452
Type *Ty = II.getType();
453
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
454
return IC.replaceInstUsesWith(II, QNaN);
455
}
456
457
if (II.isStrictFP())
458
break;
459
460
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
461
const APFloat &ArgVal = C->getValueAPF();
462
APFloat Val(ArgVal.getSemantics(), 1);
463
Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
464
465
// This is more precise than the instruction may give.
466
//
467
// TODO: The instruction always flushes denormal results (except for f16),
468
// should this also?
469
return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
470
}
471
472
FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
473
if (!FMF.allowContract())
474
break;
475
auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
476
if (!SrcCI)
477
break;
478
479
auto IID = SrcCI->getIntrinsicID();
480
// llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
481
//
482
// llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
483
// relaxed.
484
if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
485
const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
486
FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
487
if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
488
break;
489
490
if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
491
break;
492
493
Function *NewDecl = Intrinsic::getDeclaration(
494
SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
495
496
InnerFMF |= FMF;
497
II.setFastMathFlags(InnerFMF);
498
499
II.setCalledFunction(NewDecl);
500
return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
501
}
502
503
break;
504
}
505
case Intrinsic::amdgcn_sqrt:
506
case Intrinsic::amdgcn_rsq: {
507
Value *Src = II.getArgOperand(0);
508
509
// TODO: Move to ConstantFolding/InstSimplify?
510
if (isa<UndefValue>(Src)) {
511
Type *Ty = II.getType();
512
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
513
return IC.replaceInstUsesWith(II, QNaN);
514
}
515
516
// f16 amdgcn.sqrt is identical to regular sqrt.
517
if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
518
Function *NewDecl = Intrinsic::getDeclaration(
519
II.getModule(), Intrinsic::sqrt, {II.getType()});
520
II.setCalledFunction(NewDecl);
521
return &II;
522
}
523
524
break;
525
}
526
case Intrinsic::amdgcn_log:
527
case Intrinsic::amdgcn_exp2: {
528
const bool IsLog = IID == Intrinsic::amdgcn_log;
529
const bool IsExp = IID == Intrinsic::amdgcn_exp2;
530
Value *Src = II.getArgOperand(0);
531
Type *Ty = II.getType();
532
533
if (isa<PoisonValue>(Src))
534
return IC.replaceInstUsesWith(II, Src);
535
536
if (IC.getSimplifyQuery().isUndefValue(Src))
537
return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
538
539
if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
540
if (C->isInfinity()) {
541
// exp2(+inf) -> +inf
542
// log2(+inf) -> +inf
543
if (!C->isNegative())
544
return IC.replaceInstUsesWith(II, C);
545
546
// exp2(-inf) -> 0
547
if (IsExp && C->isNegative())
548
return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
549
}
550
551
if (II.isStrictFP())
552
break;
553
554
if (C->isNaN()) {
555
Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
556
return IC.replaceInstUsesWith(II, Quieted);
557
}
558
559
// f32 instruction doesn't handle denormals, f16 does.
560
if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
561
Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
562
: ConstantFP::get(Ty, 1.0);
563
return IC.replaceInstUsesWith(II, FoldedValue);
564
}
565
566
if (IsLog && C->isNegative())
567
return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
568
569
// TODO: Full constant folding matching hardware behavior.
570
}
571
572
break;
573
}
574
case Intrinsic::amdgcn_frexp_mant:
575
case Intrinsic::amdgcn_frexp_exp: {
576
Value *Src = II.getArgOperand(0);
577
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
578
int Exp;
579
APFloat Significand =
580
frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
581
582
if (IID == Intrinsic::amdgcn_frexp_mant) {
583
return IC.replaceInstUsesWith(
584
II, ConstantFP::get(II.getContext(), Significand));
585
}
586
587
// Match instruction special case behavior.
588
if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
589
Exp = 0;
590
591
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
592
}
593
594
if (isa<UndefValue>(Src)) {
595
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
596
}
597
598
break;
599
}
600
case Intrinsic::amdgcn_class: {
601
Value *Src0 = II.getArgOperand(0);
602
Value *Src1 = II.getArgOperand(1);
603
const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
604
if (CMask) {
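// The test-bit layout of llvm.amdgcn.class matches llvm.is.fpclass (bit 0 =
// signaling NaN, bit 1 = quiet NaN, ..., bit 9 = +inf), so the call can be
// retargeted directly; bits beyond fcAllFlags are cleared below.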
605
II.setCalledOperand(Intrinsic::getDeclaration(
606
II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
607
608
// Clamp any excess bits, as they're illegal for the generic intrinsic.
609
II.setArgOperand(1, ConstantInt::get(Src1->getType(),
610
CMask->getZExtValue() & fcAllFlags));
611
return &II;
612
}
613
614
// Propagate poison.
615
if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
616
return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
617
618
// llvm.amdgcn.class(_, undef) -> false
619
if (IC.getSimplifyQuery().isUndefValue(Src1))
620
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
621
622
// llvm.amdgcn.class(undef, mask) -> mask != 0
623
if (IC.getSimplifyQuery().isUndefValue(Src0)) {
624
Value *CmpMask = IC.Builder.CreateICmpNE(
625
Src1, ConstantInt::getNullValue(Src1->getType()));
626
return IC.replaceInstUsesWith(II, CmpMask);
627
}
628
break;
629
}
630
case Intrinsic::amdgcn_cvt_pkrtz: {
631
Value *Src0 = II.getArgOperand(0);
632
Value *Src1 = II.getArgOperand(1);
633
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
634
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
635
const fltSemantics &HalfSem =
636
II.getType()->getScalarType()->getFltSemantics();
637
bool LosesInfo;
638
APFloat Val0 = C0->getValueAPF();
639
APFloat Val1 = C1->getValueAPF();
640
Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
641
Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
642
643
Constant *Folded =
644
ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
645
ConstantFP::get(II.getContext(), Val1)});
646
return IC.replaceInstUsesWith(II, Folded);
647
}
648
}
649
650
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
651
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
652
}
653
654
break;
655
}
656
case Intrinsic::amdgcn_cvt_pknorm_i16:
657
case Intrinsic::amdgcn_cvt_pknorm_u16:
658
case Intrinsic::amdgcn_cvt_pk_i16:
659
case Intrinsic::amdgcn_cvt_pk_u16: {
660
Value *Src0 = II.getArgOperand(0);
661
Value *Src1 = II.getArgOperand(1);
662
663
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
664
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
665
}
666
667
break;
668
}
669
case Intrinsic::amdgcn_ubfe:
670
case Intrinsic::amdgcn_sbfe: {
671
// Decompose simple cases into standard shifts.
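// For example, on i32, ubfe(%x, 8, 8) extracts bits [15:8] and becomes:
//   %shl = shl i32 %x, 16
//   %r   = lshr i32 %shl, 24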
672
Value *Src = II.getArgOperand(0);
673
if (isa<UndefValue>(Src)) {
674
return IC.replaceInstUsesWith(II, Src);
675
}
676
677
unsigned Width;
678
Type *Ty = II.getType();
679
unsigned IntSize = Ty->getIntegerBitWidth();
680
681
ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
682
if (CWidth) {
683
Width = CWidth->getZExtValue();
684
if ((Width & (IntSize - 1)) == 0) {
685
return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
686
}
687
688
// Hardware ignores high bits, so remove those.
689
if (Width >= IntSize) {
690
return IC.replaceOperand(
691
II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
692
}
693
}
694
695
unsigned Offset;
696
ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
697
if (COffset) {
698
Offset = COffset->getZExtValue();
699
if (Offset >= IntSize) {
700
return IC.replaceOperand(
701
II, 1,
702
ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
703
}
704
}
705
706
bool Signed = IID == Intrinsic::amdgcn_sbfe;
707
708
if (!CWidth || !COffset)
709
break;
710
711
// The case of Width == 0 is handled above, which makes this transformation
712
// safe. If Width == 0, then the ashr and lshr instructions would be poison
713
// values, since the shift amount would equal the bit size.
714
assert(Width != 0);
715
716
// TODO: This allows folding to undef when the hardware has specific
717
// behavior?
718
if (Offset + Width < IntSize) {
719
Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
720
Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
721
: IC.Builder.CreateLShr(Shl, IntSize - Width);
722
RightShift->takeName(&II);
723
return IC.replaceInstUsesWith(II, RightShift);
724
}
725
726
Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
727
: IC.Builder.CreateLShr(Src, Offset);
728
729
RightShift->takeName(&II);
730
return IC.replaceInstUsesWith(II, RightShift);
731
}
732
case Intrinsic::amdgcn_exp:
733
case Intrinsic::amdgcn_exp_row:
734
case Intrinsic::amdgcn_exp_compr: {
735
ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
736
unsigned EnBits = En->getZExtValue();
737
if (EnBits == 0xf)
738
break; // All inputs enabled.
739
740
bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
741
bool Changed = false;
742
for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
743
if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
744
(IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
745
Value *Src = II.getArgOperand(I + 2);
746
if (!isa<UndefValue>(Src)) {
747
IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
748
Changed = true;
749
}
750
}
751
}
752
753
if (Changed) {
754
return &II;
755
}
756
757
break;
758
}
759
case Intrinsic::amdgcn_fmed3: {
760
// Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
761
// for the shader.
762
763
Value *Src0 = II.getArgOperand(0);
764
Value *Src1 = II.getArgOperand(1);
765
Value *Src2 = II.getArgOperand(2);
766
767
// Checking for NaN before canonicalization provides better fidelity when
768
// mapping other operations onto fmed3 since the order of operands is
769
// unchanged.
770
Value *V = nullptr;
771
if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
772
V = IC.Builder.CreateMinNum(Src1, Src2);
773
} else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
774
V = IC.Builder.CreateMinNum(Src0, Src2);
775
} else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
776
V = IC.Builder.CreateMaxNum(Src0, Src1);
777
}
778
779
if (V) {
780
if (auto *CI = dyn_cast<CallInst>(V)) {
781
CI->copyFastMathFlags(&II);
782
CI->takeName(&II);
783
}
784
return IC.replaceInstUsesWith(II, V);
785
}
786
787
bool Swap = false;
788
// Canonicalize constants to RHS operands.
789
//
790
// fmed3(c0, x, c1) -> fmed3(x, c0, c1)
791
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
792
std::swap(Src0, Src1);
793
Swap = true;
794
}
795
796
if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
797
std::swap(Src1, Src2);
798
Swap = true;
799
}
800
801
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
802
std::swap(Src0, Src1);
803
Swap = true;
804
}
805
806
if (Swap) {
807
II.setArgOperand(0, Src0);
808
II.setArgOperand(1, Src1);
809
II.setArgOperand(2, Src2);
810
return &II;
811
}
812
813
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
814
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
815
if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
816
APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
817
C2->getValueAPF());
818
return IC.replaceInstUsesWith(
819
II, ConstantFP::get(IC.Builder.getContext(), Result));
820
}
821
}
822
}
823
824
if (!ST->hasMed3_16())
825
break;
826
827
Value *X, *Y, *Z;
828
829
// Repeat floating-point width reduction done for minnum/maxnum.
830
// fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
831
if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
832
matchFPExtFromF16(Src2, Z)) {
833
Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
834
{X, Y, Z}, &II, II.getName());
835
return new FPExtInst(NewCall, II.getType());
836
}
837
838
break;
839
}
840
case Intrinsic::amdgcn_icmp:
841
case Intrinsic::amdgcn_fcmp: {
842
const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
843
// Guard against invalid arguments.
844
int64_t CCVal = CC->getZExtValue();
845
bool IsInteger = IID == Intrinsic::amdgcn_icmp;
846
if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
847
CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
848
(!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
849
CCVal > CmpInst::LAST_FCMP_PREDICATE)))
850
break;
851
852
Value *Src0 = II.getArgOperand(0);
853
Value *Src1 = II.getArgOperand(1);
854
855
if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
856
if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
857
Constant *CCmp = ConstantFoldCompareInstOperands(
858
(ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
859
if (CCmp && CCmp->isNullValue()) {
860
return IC.replaceInstUsesWith(
861
II, IC.Builder.CreateSExt(CCmp, II.getType()));
862
}
863
864
// The result of V_ICMP/V_FCMP assembly instructions (which this
865
// intrinsic exposes) is one bit per thread, masked with the EXEC
866
// register (which contains the bitmask of live threads). So a
867
// comparison that always returns true is the same as a read of the
868
// EXEC register.
869
Function *NewF = Intrinsic::getDeclaration(
870
II.getModule(), Intrinsic::read_register, II.getType());
871
Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
872
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
873
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
874
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
875
NewCall->addFnAttr(Attribute::Convergent);
876
NewCall->takeName(&II);
877
return IC.replaceInstUsesWith(II, NewCall);
878
}
879
880
// Canonicalize constants to RHS.
881
CmpInst::Predicate SwapPred =
882
CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
883
II.setArgOperand(0, Src1);
884
II.setArgOperand(1, Src0);
885
II.setArgOperand(
886
2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
887
return &II;
888
}
889
890
if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
891
break;
892
893
// Canonicalize compare eq with true value to compare != 0
894
// llvm.amdgcn.icmp(zext (i1 x), 1, eq)
895
// -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
896
// llvm.amdgcn.icmp(sext (i1 x), -1, eq)
897
// -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
898
Value *ExtSrc;
899
if (CCVal == CmpInst::ICMP_EQ &&
900
((match(Src1, PatternMatch::m_One()) &&
901
match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
902
(match(Src1, PatternMatch::m_AllOnes()) &&
903
match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
904
ExtSrc->getType()->isIntegerTy(1)) {
905
IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
906
IC.replaceOperand(II, 2,
907
ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
908
return &II;
909
}
910
911
CmpInst::Predicate SrcPred;
912
Value *SrcLHS;
913
Value *SrcRHS;
914
915
// Fold compare eq/ne with 0 from a compare result as the predicate to the
916
// intrinsic. The typical use is a wave vote function in the library, which
917
// will be fed from a user code condition compared with 0. Fold in the
918
// redundant compare.
919
920
// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
921
// -> llvm.amdgcn.[if]cmp(a, b, pred)
922
//
923
// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
924
// -> llvm.amdgcn.[if]cmp(a, b, inv pred)
925
if (match(Src1, PatternMatch::m_Zero()) &&
926
match(Src0, PatternMatch::m_ZExtOrSExt(
927
m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
928
PatternMatch::m_Value(SrcRHS))))) {
929
if (CCVal == CmpInst::ICMP_EQ)
930
SrcPred = CmpInst::getInversePredicate(SrcPred);
931
932
Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
933
? Intrinsic::amdgcn_fcmp
934
: Intrinsic::amdgcn_icmp;
935
936
Type *Ty = SrcLHS->getType();
937
if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
938
// Promote to next legal integer type.
939
unsigned Width = CmpType->getBitWidth();
940
unsigned NewWidth = Width;
941
942
// Don't do anything for i1 comparisons.
943
if (Width == 1)
944
break;
945
946
if (Width <= 16)
947
NewWidth = 16;
948
else if (Width <= 32)
949
NewWidth = 32;
950
else if (Width <= 64)
951
NewWidth = 64;
952
else
953
break; // Can't handle this.
954
955
if (Width != NewWidth) {
956
IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
957
if (CmpInst::isSigned(SrcPred)) {
958
SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
959
SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
960
} else {
961
SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
962
SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
963
}
964
}
965
} else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
966
break;
967
968
Function *NewF = Intrinsic::getDeclaration(
969
II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
970
Value *Args[] = {SrcLHS, SrcRHS,
971
ConstantInt::get(CC->getType(), SrcPred)};
972
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
973
NewCall->takeName(&II);
974
return IC.replaceInstUsesWith(II, NewCall);
975
}
976
977
break;
978
}
979
case Intrinsic::amdgcn_mbcnt_hi: {
980
// exec_hi is all 0, so this is just a copy.
981
if (ST->isWave32())
982
return IC.replaceInstUsesWith(II, II.getArgOperand(1));
983
break;
984
}
985
case Intrinsic::amdgcn_ballot: {
986
if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
987
if (Src->isZero()) {
988
// amdgcn.ballot(i1 0) is zero.
989
return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
990
}
991
}
992
if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
993
// %b64 = call i64 ballot.i64(...)
994
// =>
995
// %b32 = call i32 ballot.i32(...)
996
// %b64 = zext i32 %b32 to i64
997
Value *Call = IC.Builder.CreateZExt(
998
IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
999
{IC.Builder.getInt32Ty()},
1000
{II.getArgOperand(0)}),
1001
II.getType());
1002
Call->takeName(&II);
1003
return IC.replaceInstUsesWith(II, Call);
1004
}
1005
break;
1006
}
1007
case Intrinsic::amdgcn_wqm_vote: {
1008
// wqm_vote is identity when the argument is constant.
1009
if (!isa<Constant>(II.getArgOperand(0)))
1010
break;
1011
1012
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1013
}
1014
case Intrinsic::amdgcn_kill: {
1015
const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1016
if (!C || !C->getZExtValue())
1017
break;
1018
1019
// amdgcn.kill(i1 1) is a no-op
1020
return IC.eraseInstFromFunction(II);
1021
}
1022
case Intrinsic::amdgcn_update_dpp: {
1023
Value *Old = II.getArgOperand(0);
1024
1025
auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1026
auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1027
auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1028
if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1029
BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1030
break;
1031
1032
// If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1033
return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1034
}
1035
case Intrinsic::amdgcn_permlane16:
1036
case Intrinsic::amdgcn_permlane16_var:
1037
case Intrinsic::amdgcn_permlanex16:
1038
case Intrinsic::amdgcn_permlanex16_var: {
1039
// Discard vdst_in if it's not going to be read.
1040
Value *VDstIn = II.getArgOperand(0);
1041
if (isa<UndefValue>(VDstIn))
1042
break;
1043
1044
// FetchInvalid operand idx.
1045
unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1046
IID == Intrinsic::amdgcn_permlanex16)
1047
? 4 /* for permlane16 and permlanex16 */
1048
: 3; /* for permlane16_var and permlanex16_var */
1049
1050
// BoundCtrl operand idx.
1051
// For permlane16 and permlanex16 it should be 5
1052
// For permlane16_var and permlanex16_var it should be 4
1053
unsigned int BcIdx = FiIdx + 1;
1054
1055
ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1056
ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1057
if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1058
break;
1059
1060
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1061
}
1062
case Intrinsic::amdgcn_permlane64:
1063
// A constant value is trivially uniform.
1064
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1065
return IC.replaceInstUsesWith(II, C);
1066
}
1067
break;
1068
case Intrinsic::amdgcn_readfirstlane:
1069
case Intrinsic::amdgcn_readlane: {
1070
// A constant value is trivially uniform.
1071
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1072
return IC.replaceInstUsesWith(II, C);
1073
}
1074
1075
// The rest of these may not be safe if the exec may not be the same between
1076
// the def and use.
1077
Value *Src = II.getArgOperand(0);
1078
Instruction *SrcInst = dyn_cast<Instruction>(Src);
1079
if (SrcInst && SrcInst->getParent() != II.getParent())
1080
break;
1081
1082
// readfirstlane (readfirstlane x) -> readfirstlane x
1083
// readlane (readfirstlane x), y -> readfirstlane x
1084
if (match(Src,
1085
PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1086
return IC.replaceInstUsesWith(II, Src);
1087
}
1088
1089
if (IID == Intrinsic::amdgcn_readfirstlane) {
1090
// readfirstlane (readlane x, y) -> readlane x, y
1091
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1092
return IC.replaceInstUsesWith(II, Src);
1093
}
1094
} else {
1095
// readlane (readlane x, y), y -> readlane x, y
1096
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1097
PatternMatch::m_Value(),
1098
PatternMatch::m_Specific(II.getArgOperand(1))))) {
1099
return IC.replaceInstUsesWith(II, Src);
1100
}
1101
}
1102
1103
break;
1104
}
1105
case Intrinsic::amdgcn_trig_preop: {
1106
// The intrinsic is declared with name mangling, but currently the
1107
// instruction only exists for f64
1108
if (!II.getType()->isDoubleTy())
1109
break;
1110
1111
Value *Src = II.getArgOperand(0);
1112
Value *Segment = II.getArgOperand(1);
1113
if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1114
return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1115
1116
if (isa<UndefValue>(Src)) {
1117
auto *QNaN = ConstantFP::get(
1118
II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
1119
return IC.replaceInstUsesWith(II, QNaN);
1120
}
1121
1122
const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
1123
if (!Csrc)
1124
break;
1125
1126
if (II.isStrictFP())
1127
break;
1128
1129
const APFloat &Fsrc = Csrc->getValueAPF();
1130
if (Fsrc.isNaN()) {
1131
auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
1132
return IC.replaceInstUsesWith(II, Quieted);
1133
}
1134
1135
const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1136
if (!Cseg)
1137
break;
1138
1139
unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1140
unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1141
unsigned Shift = SegmentVal * 53;
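// Each segment selects a 53-bit window into the 2.0/PI bit pattern below; for
// sources with a biased exponent above 1077 the window is shifted further.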
1142
if (Exponent > 1077)
1143
Shift += Exponent - 1077;
1144
1145
// 2.0/PI table.
1146
static const uint32_t TwoByPi[] = {
1147
0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1148
0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1149
0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1150
0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1151
0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1152
0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1153
0x56033046};
1154
1155
// Return 0 for an out-of-bounds segment (hardware behavior).
1156
unsigned Idx = Shift >> 5;
1157
if (Idx + 2 >= std::size(TwoByPi)) {
1158
APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1159
return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1160
}
1161
1162
unsigned BShift = Shift & 0x1f;
1163
uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
1164
uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
1165
if (BShift)
1166
Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
1167
Thi = Thi >> 11;
1168
APFloat Result = APFloat((double)Thi);
1169
1170
int Scale = -53 - Shift;
1171
if (Exponent >= 1968)
1172
Scale += 128;
1173
1174
Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
1175
return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
1176
}
1177
case Intrinsic::amdgcn_fmul_legacy: {
1178
Value *Op0 = II.getArgOperand(0);
1179
Value *Op1 = II.getArgOperand(1);
1180
1181
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1182
// infinity, gives +0.0.
1183
// TODO: Move to InstSimplify?
1184
if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1185
match(Op1, PatternMatch::m_AnyZeroFP()))
1186
return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1187
1188
// If we can prove we don't have one of the special cases then we can use a
1189
// normal fmul instruction instead.
1190
if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1191
auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1192
FMul->takeName(&II);
1193
return IC.replaceInstUsesWith(II, FMul);
1194
}
1195
break;
1196
}
1197
case Intrinsic::amdgcn_fma_legacy: {
1198
Value *Op0 = II.getArgOperand(0);
1199
Value *Op1 = II.getArgOperand(1);
1200
Value *Op2 = II.getArgOperand(2);
1201
1202
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1203
// infinity, gives +0.0.
1204
// TODO: Move to InstSimplify?
1205
if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1206
match(Op1, PatternMatch::m_AnyZeroFP())) {
1207
// It's tempting to just return Op2 here, but that would give the wrong
1208
// result if Op2 was -0.0.
1209
auto *Zero = ConstantFP::getZero(II.getType());
1210
auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1211
FAdd->takeName(&II);
1212
return IC.replaceInstUsesWith(II, FAdd);
1213
}
1214
1215
// If we can prove we don't have one of the special cases then we can use a
1216
// normal fma instead.
1217
if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1218
II.setCalledOperand(Intrinsic::getDeclaration(
1219
II.getModule(), Intrinsic::fma, II.getType()));
1220
return &II;
1221
}
1222
break;
1223
}
1224
case Intrinsic::amdgcn_is_shared:
1225
case Intrinsic::amdgcn_is_private: {
1226
if (isa<UndefValue>(II.getArgOperand(0)))
1227
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1228
1229
if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1230
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1231
break;
1232
}
1233
case Intrinsic::amdgcn_raw_buffer_store_format:
1234
case Intrinsic::amdgcn_struct_buffer_store_format:
1235
case Intrinsic::amdgcn_raw_tbuffer_store:
1236
case Intrinsic::amdgcn_struct_tbuffer_store:
1237
case Intrinsic::amdgcn_image_store_1d:
1238
case Intrinsic::amdgcn_image_store_1darray:
1239
case Intrinsic::amdgcn_image_store_2d:
1240
case Intrinsic::amdgcn_image_store_2darray:
1241
case Intrinsic::amdgcn_image_store_2darraymsaa:
1242
case Intrinsic::amdgcn_image_store_2dmsaa:
1243
case Intrinsic::amdgcn_image_store_3d:
1244
case Intrinsic::amdgcn_image_store_cube:
1245
case Intrinsic::amdgcn_image_store_mip_1d:
1246
case Intrinsic::amdgcn_image_store_mip_1darray:
1247
case Intrinsic::amdgcn_image_store_mip_2d:
1248
case Intrinsic::amdgcn_image_store_mip_2darray:
1249
case Intrinsic::amdgcn_image_store_mip_3d:
1250
case Intrinsic::amdgcn_image_store_mip_cube: {
1251
if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1252
break;
1253
1254
APInt DemandedElts;
1255
if (ST->hasDefaultComponentBroadcast())
1256
DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1257
else if (ST->hasDefaultComponentZero())
1258
DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1259
else
1260
break;
1261
1262
int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1263
if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1264
false)) {
1265
return IC.eraseInstFromFunction(II);
1266
}
1267
1268
break;
1269
}
1270
}
1271
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1272
AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1273
return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1274
}
1275
return std::nullopt;
1276
}
1277
1278
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1279
///
1280
/// For amdgcn image and buffer store intrinsics, simplification rewrites the
1281
/// intrinsic's vector data operand, rather than the uses of a loaded result as
1282
/// it does for image and buffer loads.
1283
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1284
/// struct returns.
1285
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1286
IntrinsicInst &II,
1287
APInt DemandedElts,
1288
int DMaskIdx, bool IsLoad) {
1289
1290
auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1291
: II.getOperand(0)->getType());
1292
unsigned VWidth = IIVTy->getNumElements();
1293
if (VWidth == 1)
1294
return nullptr;
1295
Type *EltTy = IIVTy->getElementType();
1296
1297
IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1298
IC.Builder.SetInsertPoint(&II);
1299
1300
// Assume the arguments are unchanged and later override them, if needed.
1301
SmallVector<Value *, 16> Args(II.args());
1302
1303
if (DMaskIdx < 0) {
1304
// Buffer case.
1305
1306
const unsigned ActiveBits = DemandedElts.getActiveBits();
1307
const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1308
1309
// Start assuming the prefix of elements is demanded, but possibly clear
1310
// some other bits if there are trailing zeros (unused components at front)
1311
// and update offset.
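// For example, if only elements 2 and 3 of a <4 x float> raw buffer load are
// used, DemandedElts becomes 0b1100 and the byte offset is advanced by 8, so
// the call can later be shrunk to a <2 x float> load.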
1312
DemandedElts = (1 << ActiveBits) - 1;
1313
1314
if (UnusedComponentsAtFront > 0) {
1315
static const unsigned InvalidOffsetIdx = 0xf;
1316
1317
unsigned OffsetIdx;
1318
switch (II.getIntrinsicID()) {
1319
case Intrinsic::amdgcn_raw_buffer_load:
1320
case Intrinsic::amdgcn_raw_ptr_buffer_load:
1321
OffsetIdx = 1;
1322
break;
1323
case Intrinsic::amdgcn_s_buffer_load:
1324
// If the resulting type is vec3, there is no point in trimming the
1325
// load with an updated offset, as the vec3 would most likely be widened to
1326
// vec4 anyway during lowering.
1327
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1328
OffsetIdx = InvalidOffsetIdx;
1329
else
1330
OffsetIdx = 1;
1331
break;
1332
case Intrinsic::amdgcn_struct_buffer_load:
1333
case Intrinsic::amdgcn_struct_ptr_buffer_load:
1334
OffsetIdx = 2;
1335
break;
1336
default:
1337
// TODO: handle tbuffer* intrinsics.
1338
OffsetIdx = InvalidOffsetIdx;
1339
break;
1340
}
1341
1342
if (OffsetIdx != InvalidOffsetIdx) {
1343
// Clear demanded bits and update the offset.
1344
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1345
auto *Offset = Args[OffsetIdx];
1346
unsigned SingleComponentSizeInBits =
1347
IC.getDataLayout().getTypeSizeInBits(EltTy);
1348
unsigned OffsetAdd =
1349
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1350
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1351
Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1352
}
1353
}
1354
} else {
1355
// Image case.
1356
1357
ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1358
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1359
1360
// dmask 0 has special semantics, do not simplify.
1361
if (DMaskVal == 0)
1362
return nullptr;
1363
1364
// Mask off values that are undefined because the dmask doesn't cover them
1365
DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
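// For example, with dmask 0b0101 (channels x and z) and only element 0 of the
// demanded vector in use, the loop below produces a new dmask of 0b0001.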
1366
1367
unsigned NewDMaskVal = 0;
1368
unsigned OrigLdStIdx = 0;
1369
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1370
const unsigned Bit = 1 << SrcIdx;
1371
if (!!(DMaskVal & Bit)) {
1372
if (!!DemandedElts[OrigLdStIdx])
1373
NewDMaskVal |= Bit;
1374
OrigLdStIdx++;
1375
}
1376
}
1377
1378
if (DMaskVal != NewDMaskVal)
1379
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1380
}
1381
1382
unsigned NewNumElts = DemandedElts.popcount();
1383
if (!NewNumElts)
1384
return PoisonValue::get(IIVTy);
1385
1386
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1387
if (DMaskIdx >= 0)
1388
II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1389
return nullptr;
1390
}
1391
1392
// Validate function argument and return types, extracting overloaded types
1393
// along the way.
1394
SmallVector<Type *, 6> OverloadTys;
1395
if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1396
return nullptr;
1397
1398
Type *NewTy =
1399
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1400
OverloadTys[0] = NewTy;
1401
1402
if (!IsLoad) {
1403
SmallVector<int, 8> EltMask;
1404
for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1405
if (DemandedElts[OrigStoreIdx])
1406
EltMask.push_back(OrigStoreIdx);
1407
1408
if (NewNumElts == 1)
1409
Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1410
else
1411
Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1412
}
1413
1414
Function *NewIntrin = Intrinsic::getDeclaration(
1415
II.getModule(), II.getIntrinsicID(), OverloadTys);
1416
CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1417
NewCall->takeName(&II);
1418
NewCall->copyMetadata(II);
1419
1420
if (IsLoad) {
1421
if (NewNumElts == 1) {
1422
return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1423
DemandedElts.countr_zero());
1424
}
1425
1426
SmallVector<int, 8> EltMask;
1427
unsigned NewLoadIdx = 0;
1428
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1429
if (!!DemandedElts[OrigLoadIdx])
1430
EltMask.push_back(NewLoadIdx++);
1431
else
1432
EltMask.push_back(NewNumElts);
1433
}
1434
1435
auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1436
1437
return Shuffle;
1438
}
1439
1440
return NewCall;
1441
}
1442
1443
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1444
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1445
APInt &UndefElts2, APInt &UndefElts3,
1446
std::function<void(Instruction *, unsigned, APInt, APInt &)>
1447
SimplifyAndSetOp) const {
1448
switch (II.getIntrinsicID()) {
1449
case Intrinsic::amdgcn_raw_buffer_load:
1450
case Intrinsic::amdgcn_raw_ptr_buffer_load:
1451
case Intrinsic::amdgcn_raw_buffer_load_format:
1452
case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1453
case Intrinsic::amdgcn_raw_tbuffer_load:
1454
case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1455
case Intrinsic::amdgcn_s_buffer_load:
1456
case Intrinsic::amdgcn_struct_buffer_load:
1457
case Intrinsic::amdgcn_struct_ptr_buffer_load:
1458
case Intrinsic::amdgcn_struct_buffer_load_format:
1459
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1460
case Intrinsic::amdgcn_struct_tbuffer_load:
1461
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1462
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1463
default: {
1464
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1465
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1466
}
1467
break;
1468
}
1469
}
1470
return std::nullopt;
1471
}
1472
1473