Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced values:
/// 1. DPP -
///    This is the most efficient implementation for scan. DPP uses Whole Wave
///    Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;
  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {

  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {

  // Scan option None disables the Pass
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for DPP strategy), and the atomic
  // operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
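  // Each xor-mask step above pairs a lane with the lane whose index differs in
  // bit Idx of the lane id, so after the four steps every lane should hold the
  // reduction of all 16 lanes in its row (a butterfly reduction).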

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the
// DPP-based implementation, which uses WWM for the scan computations. This
// API iterates over the active lanes to read, compute and update the value
// using the readlane and writelane intrinsics.
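//
// The generated loop is roughly:
//
//   ActiveBits  = ballot(true); Accumulator = Identity;
//   do {
//     Lane        = cttz(ActiveBits);
//     OldValue    = writelane(Accumulator, Lane, OldValue); // exclusive scan
//     Accumulator = Accumulator op readlane(V, Lane);
//     ActiveBits &= ~(1 << Lane);
//   } while (ActiveBits != 0);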
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // PHI nodes for the accumulator, the scan result destination, and the
  // remaining active lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform a writelane if the intermediate scan results are required later in
  // the kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the current active lane so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C,
                           APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
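  // For example, if the ballot mask is 0b...01101 then the active lanes 0, 2
  // and 3 get mbcnt values of 0, 1 and 2 respectively; exactly one active lane
  // (here lane 0) gets the value 0.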
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with the add operation and allow one lane
  // to subtract the reduced value later.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the values
  // across the wavefront first, using either DPP or the iterative scan loop.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value, so
      // that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront.
        // This will be our new value which we will provide to the atomic
        // operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point, we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also completed the
  // codegen for the new control flow, i.e. the iterative loop which performs
  // the reduction and scan using ComputeLoop and ComputeEnd.
  // For the new control flow, we need to move the branch instruction, i.e. the
  // terminator created during SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the exit block
  // for when the single lane has finished updating the final reduced value.
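  //
  // For the iterative path the resulting control flow is roughly:
  //
  //   entry --> ComputeLoop --> ComputeEnd --> single_lane -\
  //               ^         |              \------------------> exit
  //               \---------/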
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch unconditionally from I's block to the ComputeLoop block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, make sure we
    // move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single_lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value of the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use the readfirstlane
    // intrinsic for this.
    Value *BroadcastI = nullptr;
    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
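      // With a uniform value, this lane's exclusive "scan" result can be
      // computed directly from Mbcnt: Mbcnt copies of V for add/sub (and
      // fadd/fsub), V times the parity of Mbcnt for xor, and, for the
      // idempotent operations below, the identity in the first active lane
      // and V in every other lane.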
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // inf or nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}