GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    The most efficient implementation for scans. DPP uses Whole Wave Mode
///    (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane &
///    writelane intrinsics.
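///
/// As a rough illustration of the transform (uniform-value case): if N lanes
/// of a wave each execute
///   %old = atomicrmw add ptr addrspace(1) %p, i32 %v
/// the pass arranges for a single lane to perform one
///   atomicrmw add ptr addrspace(1) %p, i32 (%v * N)
/// and every other lane reconstructs its own result from the returned value
/// plus %v times the number of active lanes below it (llvm.amdgcn.mbcnt).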
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // The 'None' scan option disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
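  // The integer min/max cases fall through to the compare-and-select lowering
  // below, since they have no direct non-atomic binary instruction form.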
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
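  // Each ROW_XMASK step combines a lane with the lane whose index differs in
  // bit Idx (xmask 1, 2, 4, 8), i.e. a butterfly reduction; after the four
  // steps every lane in a row of 16 holds the row-wide result.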

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
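  // The four ROW_SHR steps (shift right by 1, 2, 4 and 8 lanes, with vacated
  // slots filled from the identity) form a Hillis-Steele style prefix scan, so
  // at this point every lane holds the inclusive scan of its 16-lane row.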
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
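// For example, if lanes 0..2 hold the inclusive scan {a, a+b, a+b+c}, the
// shifted result is {Identity, a, a+b}: each lane ends up with the combined
// value of the lanes strictly below it.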
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which uses WWM for its scan computations. This routine
// iterates over the active lanes to read, compute and update the value using
// the readlane and writelane intrinsics.
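//
// Roughly, for an add, the loop built below computes:
//   Accumulator = Identity; ActiveBits = ballot(true);
//   while (ActiveBits) {
//     Lane = cttz(ActiveBits);
//     OldValue[Lane] = Accumulator;        // writelane, only if needed
//     Accumulator += readlane(V, Lane);
//     ActiveBits &= ~(1 << Lane);
//   }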
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for ComputeLoop block
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for Accumulator, Scan results destination, and Active Lanes
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
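  // The identity is the value that leaves the other operand unchanged under
  // Op, e.g. 0 for add/sub/or/xor, all-ones for and and umin, and the signed
  // minimum for a signed max; it lets inactive lanes take part in the scan
  // without affecting the result.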
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
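  // For example, with an active mask of 0b1011, lanes 0, 1 and 3 receive mbcnt
  // values 0, 1 and 2 respectively: a compact per-lane index among the active
  // lanes.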

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform scan with add operation and allow one lane to
  // subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for scan
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
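      // For instance, if 5 lanes are active and each wants to add V, the one
      // atomic performed below adds 5 * V.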
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also finished the codegen
  // for the new control flow, i.e. the iterative loop that performs the
  // reduction and scan using ComputeLoop and ComputeEnd.
  // For that new control flow, we need to move the branch instruction (the
  // terminator created by SplitBlockAndInsertIfThen) from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // for when the single lane has finished updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch unconditionally from I's block to the ComputeLoop block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from EntryBB to ComputeEnd, so make sure we
    // move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value that was in the lowest active lane (the
    // first lane) to all other lanes in the wavefront. We use the
    // readfirstlane intrinsic for this.
    Value *BroadcastI = nullptr;
    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
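    // For a uniform add, for example, a lane with K active lanes below it ends
    // up with OldValue + K * V, just as if the lanes had performed their
    // atomics one at a time in lane order.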
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // inf or nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}