Path: blob/main/contrib/llvm-project/llvm/lib/CodeGen/AtomicExpandPass.cpp
//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file contains a pass (at IR level) to replace atomic instructions with9// __atomic_* library calls, or target specific instruction which implement the10// same semantics in a way which better fits the target backend. This can11// include the use of (intrinsic-based) load-linked/store-conditional loops,12// AtomicCmpXchg, or type coercions.13//14//===----------------------------------------------------------------------===//1516#include "llvm/ADT/ArrayRef.h"17#include "llvm/ADT/STLFunctionalExtras.h"18#include "llvm/ADT/SmallVector.h"19#include "llvm/Analysis/InstSimplifyFolder.h"20#include "llvm/Analysis/OptimizationRemarkEmitter.h"21#include "llvm/CodeGen/AtomicExpand.h"22#include "llvm/CodeGen/AtomicExpandUtils.h"23#include "llvm/CodeGen/RuntimeLibcallUtil.h"24#include "llvm/CodeGen/TargetLowering.h"25#include "llvm/CodeGen/TargetPassConfig.h"26#include "llvm/CodeGen/TargetSubtargetInfo.h"27#include "llvm/CodeGen/ValueTypes.h"28#include "llvm/IR/Attributes.h"29#include "llvm/IR/BasicBlock.h"30#include "llvm/IR/Constant.h"31#include "llvm/IR/Constants.h"32#include "llvm/IR/DataLayout.h"33#include "llvm/IR/DerivedTypes.h"34#include "llvm/IR/Function.h"35#include "llvm/IR/IRBuilder.h"36#include "llvm/IR/InstIterator.h"37#include "llvm/IR/Instruction.h"38#include "llvm/IR/Instructions.h"39#include "llvm/IR/MDBuilder.h"40#include "llvm/IR/MemoryModelRelaxationAnnotations.h"41#include "llvm/IR/Module.h"42#include "llvm/IR/Type.h"43#include "llvm/IR/User.h"44#include "llvm/IR/Value.h"45#include "llvm/InitializePasses.h"46#include "llvm/Pass.h"47#include "llvm/Support/AtomicOrdering.h"48#include "llvm/Support/Casting.h"49#include "llvm/Support/Debug.h"50#include "llvm/Support/ErrorHandling.h"51#include "llvm/Support/raw_ostream.h"52#include "llvm/Target/TargetMachine.h"53#include "llvm/Transforms/Utils/LowerAtomic.h"54#include <cassert>55#include <cstdint>56#include <iterator>5758using namespace llvm;5960#define DEBUG_TYPE "atomic-expand"6162namespace {6364class AtomicExpandImpl {65const TargetLowering *TLI = nullptr;66const DataLayout *DL = nullptr;6768private:69bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);70IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);71LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);72bool tryExpandAtomicLoad(LoadInst *LI);73bool expandAtomicLoadToLL(LoadInst *LI);74bool expandAtomicLoadToCmpXchg(LoadInst *LI);75StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);76bool tryExpandAtomicStore(StoreInst *SI);77void expandAtomicStore(StoreInst *SI);78bool tryExpandAtomicRMW(AtomicRMWInst *AI);79AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);80Value *81insertRMWLLSCLoop(IRBuilderBase &Builder, Type *ResultTy, Value *Addr,82Align AddrAlign, AtomicOrdering MemOpOrder,83function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);84void expandAtomicOpToLLSC(85Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign,86AtomicOrdering MemOpOrder,87function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);88void expandPartwordAtomicRMW(89AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);90AtomicRMWInst 
*widenPartwordAtomicRMW(AtomicRMWInst *AI);91bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);92void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);93void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);9495AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);96static Value *insertRMWCmpXchgLoop(97IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,98AtomicOrdering MemOpOrder, SyncScope::ID SSID,99function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,100CreateCmpXchgInstFun CreateCmpXchg);101bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);102103bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);104bool isIdempotentRMW(AtomicRMWInst *RMWI);105bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);106107bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,108Value *PointerOperand, Value *ValueOperand,109Value *CASExpected, AtomicOrdering Ordering,110AtomicOrdering Ordering2,111ArrayRef<RTLIB::Libcall> Libcalls);112void expandAtomicLoadToLibcall(LoadInst *LI);113void expandAtomicStoreToLibcall(StoreInst *LI);114void expandAtomicRMWToLibcall(AtomicRMWInst *I);115void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);116117friend bool118llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,119CreateCmpXchgInstFun CreateCmpXchg);120121public:122bool run(Function &F, const TargetMachine *TM);123};124125class AtomicExpandLegacy : public FunctionPass {126public:127static char ID; // Pass identification, replacement for typeid128129AtomicExpandLegacy() : FunctionPass(ID) {130initializeAtomicExpandLegacyPass(*PassRegistry::getPassRegistry());131}132133bool runOnFunction(Function &F) override;134};135136// IRBuilder to be used for replacement atomic instructions.137struct ReplacementIRBuilder138: IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> {139MDNode *MMRAMD = nullptr;140141// Preserves the DebugLoc from I, and preserves still valid metadata.142// Enable StrictFP builder mode when appropriate.143explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL)144: IRBuilder(I->getContext(), DL,145IRBuilderCallbackInserter(146[this](Instruction *I) { addMMRAMD(I); })) {147SetInsertPoint(I);148this->CollectMetadataToCopy(I, {LLVMContext::MD_pcsections});149if (BB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP))150this->setIsFPConstrained(true);151152MMRAMD = I->getMetadata(LLVMContext::MD_mmra);153}154155void addMMRAMD(Instruction *I) {156if (canInstructionHaveMMRAs(*I))157I->setMetadata(LLVMContext::MD_mmra, MMRAMD);158}159};160161} // end anonymous namespace162163char AtomicExpandLegacy::ID = 0;164165char &llvm::AtomicExpandID = AtomicExpandLegacy::ID;166167INITIALIZE_PASS_BEGIN(AtomicExpandLegacy, DEBUG_TYPE,168"Expand Atomic instructions", false, false)169INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)170INITIALIZE_PASS_END(AtomicExpandLegacy, DEBUG_TYPE,171"Expand Atomic instructions", false, false)172173// Helper functions to retrieve the size of atomic instructions.174static unsigned getAtomicOpSize(LoadInst *LI) {175const DataLayout &DL = LI->getDataLayout();176return DL.getTypeStoreSize(LI->getType());177}178179static unsigned getAtomicOpSize(StoreInst *SI) {180const DataLayout &DL = SI->getDataLayout();181return DL.getTypeStoreSize(SI->getValueOperand()->getType());182}183184static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {185const DataLayout &DL = RMWI->getDataLayout();186return DL.getTypeStoreSize(RMWI->getValOperand()->getType());187}188189static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) 
{190const DataLayout &DL = CASI->getDataLayout();191return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());192}193194// Determine if a particular atomic operation has a supported size,195// and is of appropriate alignment, to be passed through for target196// lowering. (Versus turning into a __atomic libcall)197template <typename Inst>198static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {199unsigned Size = getAtomicOpSize(I);200Align Alignment = I->getAlign();201return Alignment >= Size &&202Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;203}204205bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {206const auto *Subtarget = TM->getSubtargetImpl(F);207if (!Subtarget->enableAtomicExpand())208return false;209TLI = Subtarget->getTargetLowering();210DL = &F.getDataLayout();211212SmallVector<Instruction *, 1> AtomicInsts;213214// Changing control-flow while iterating through it is a bad idea, so gather a215// list of all atomic instructions before we start.216for (Instruction &I : instructions(F))217if (I.isAtomic() && !isa<FenceInst>(&I))218AtomicInsts.push_back(&I);219220bool MadeChange = false;221for (auto *I : AtomicInsts) {222auto LI = dyn_cast<LoadInst>(I);223auto SI = dyn_cast<StoreInst>(I);224auto RMWI = dyn_cast<AtomicRMWInst>(I);225auto CASI = dyn_cast<AtomicCmpXchgInst>(I);226assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");227228// If the Size/Alignment is not supported, replace with a libcall.229if (LI) {230if (!atomicSizeSupported(TLI, LI)) {231expandAtomicLoadToLibcall(LI);232MadeChange = true;233continue;234}235} else if (SI) {236if (!atomicSizeSupported(TLI, SI)) {237expandAtomicStoreToLibcall(SI);238MadeChange = true;239continue;240}241} else if (RMWI) {242if (!atomicSizeSupported(TLI, RMWI)) {243expandAtomicRMWToLibcall(RMWI);244MadeChange = true;245continue;246}247} else if (CASI) {248if (!atomicSizeSupported(TLI, CASI)) {249expandAtomicCASToLibcall(CASI);250MadeChange = true;251continue;252}253}254255if (LI && TLI->shouldCastAtomicLoadInIR(LI) ==256TargetLoweringBase::AtomicExpansionKind::CastToInteger) {257I = LI = convertAtomicLoadToIntegerType(LI);258MadeChange = true;259} else if (SI &&260TLI->shouldCastAtomicStoreInIR(SI) ==261TargetLoweringBase::AtomicExpansionKind::CastToInteger) {262I = SI = convertAtomicStoreToIntegerType(SI);263MadeChange = true;264} else if (RMWI &&265TLI->shouldCastAtomicRMWIInIR(RMWI) ==266TargetLoweringBase::AtomicExpansionKind::CastToInteger) {267I = RMWI = convertAtomicXchgToIntegerType(RMWI);268MadeChange = true;269} else if (CASI) {270// TODO: when we're ready to make the change at the IR level, we can271// extend convertCmpXchgToInteger for floating point too.272if (CASI->getCompareOperand()->getType()->isPointerTy()) {273// TODO: add a TLI hook to control this so that each target can274// convert to lowering the original type one at a time.275I = CASI = convertCmpXchgToIntegerType(CASI);276MadeChange = true;277}278}279280if (TLI->shouldInsertFencesForAtomic(I)) {281auto FenceOrdering = AtomicOrdering::Monotonic;282if (LI && isAcquireOrStronger(LI->getOrdering())) {283FenceOrdering = LI->getOrdering();284LI->setOrdering(AtomicOrdering::Monotonic);285} else if (SI && isReleaseOrStronger(SI->getOrdering())) {286FenceOrdering = SI->getOrdering();287SI->setOrdering(AtomicOrdering::Monotonic);288} else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||289isAcquireOrStronger(RMWI->getOrdering()))) {290FenceOrdering = 
RMWI->getOrdering();291RMWI->setOrdering(AtomicOrdering::Monotonic);292} else if (CASI &&293TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==294TargetLoweringBase::AtomicExpansionKind::None &&295(isReleaseOrStronger(CASI->getSuccessOrdering()) ||296isAcquireOrStronger(CASI->getSuccessOrdering()) ||297isAcquireOrStronger(CASI->getFailureOrdering()))) {298// If a compare and swap is lowered to LL/SC, we can do smarter fence299// insertion, with a stronger one on the success path than on the300// failure path. As a result, fence insertion is directly done by301// expandAtomicCmpXchg in that case.302FenceOrdering = CASI->getMergedOrdering();303CASI->setSuccessOrdering(AtomicOrdering::Monotonic);304CASI->setFailureOrdering(AtomicOrdering::Monotonic);305}306307if (FenceOrdering != AtomicOrdering::Monotonic) {308MadeChange |= bracketInstWithFences(I, FenceOrdering);309}310} else if (I->hasAtomicStore() &&311TLI->shouldInsertTrailingFenceForAtomicStore(I)) {312auto FenceOrdering = AtomicOrdering::Monotonic;313if (SI)314FenceOrdering = SI->getOrdering();315else if (RMWI)316FenceOrdering = RMWI->getOrdering();317else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=318TargetLoweringBase::AtomicExpansionKind::LLSC)319// LLSC is handled in expandAtomicCmpXchg().320FenceOrdering = CASI->getSuccessOrdering();321322IRBuilder Builder(I);323if (auto TrailingFence =324TLI->emitTrailingFence(Builder, I, FenceOrdering)) {325TrailingFence->moveAfter(I);326MadeChange = true;327}328}329330if (LI)331MadeChange |= tryExpandAtomicLoad(LI);332else if (SI)333MadeChange |= tryExpandAtomicStore(SI);334else if (RMWI) {335// There are two different ways of expanding RMW instructions:336// - into a load if it is idempotent337// - into a Cmpxchg/LL-SC loop otherwise338// we try them in that order.339340if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {341MadeChange = true;342} else {343MadeChange |= tryExpandAtomicRMW(RMWI);344}345} else if (CASI)346MadeChange |= tryExpandAtomicCmpXchg(CASI);347}348return MadeChange;349}350351bool AtomicExpandLegacy::runOnFunction(Function &F) {352353auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();354if (!TPC)355return false;356auto *TM = &TPC->getTM<TargetMachine>();357AtomicExpandImpl AE;358return AE.run(F, TM);359}360361FunctionPass *llvm::createAtomicExpandLegacyPass() {362return new AtomicExpandLegacy();363}364365PreservedAnalyses AtomicExpandPass::run(Function &F,366FunctionAnalysisManager &AM) {367AtomicExpandImpl AE;368369bool Changed = AE.run(F, TM);370if (!Changed)371return PreservedAnalyses::all();372373return PreservedAnalyses::none();374}375376bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,377AtomicOrdering Order) {378ReplacementIRBuilder Builder(I, *DL);379380auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);381382auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);383// We have a guard here because not every atomic operation generates a384// trailing fence.385if (TrailingFence)386TrailingFence->moveAfter(I);387388return (LeadingFence || TrailingFence);389}390391/// Get the iX type with the same bitwidth as T.392IntegerType *393AtomicExpandImpl::getCorrespondingIntegerType(Type *T, const DataLayout &DL) {394EVT VT = TLI->getMemValueType(DL, T);395unsigned BitWidth = VT.getStoreSizeInBits();396assert(BitWidth == VT.getSizeInBits() && "must be a power of two");397return IntegerType::get(T->getContext(), BitWidth);398}399400/// Convert an atomic load of a non-integral type to an integer load of the401/// equivalent bitwidth. 
See the function comment on402/// convertAtomicStoreToIntegerType for background.403LoadInst *AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) {404auto *M = LI->getModule();405Type *NewTy = getCorrespondingIntegerType(LI->getType(), M->getDataLayout());406407ReplacementIRBuilder Builder(LI, *DL);408409Value *Addr = LI->getPointerOperand();410411auto *NewLI = Builder.CreateLoad(NewTy, Addr);412NewLI->setAlignment(LI->getAlign());413NewLI->setVolatile(LI->isVolatile());414NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());415LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");416417Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());418LI->replaceAllUsesWith(NewVal);419LI->eraseFromParent();420return NewLI;421}422423AtomicRMWInst *424AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {425auto *M = RMWI->getModule();426Type *NewTy =427getCorrespondingIntegerType(RMWI->getType(), M->getDataLayout());428429ReplacementIRBuilder Builder(RMWI, *DL);430431Value *Addr = RMWI->getPointerOperand();432Value *Val = RMWI->getValOperand();433Value *NewVal = Val->getType()->isPointerTy()434? Builder.CreatePtrToInt(Val, NewTy)435: Builder.CreateBitCast(Val, NewTy);436437auto *NewRMWI = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, Addr, NewVal,438RMWI->getAlign(), RMWI->getOrdering(),439RMWI->getSyncScopeID());440NewRMWI->setVolatile(RMWI->isVolatile());441LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n");442443Value *NewRVal = RMWI->getType()->isPointerTy()444? Builder.CreateIntToPtr(NewRMWI, RMWI->getType())445: Builder.CreateBitCast(NewRMWI, RMWI->getType());446RMWI->replaceAllUsesWith(NewRVal);447RMWI->eraseFromParent();448return NewRMWI;449}450451bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {452switch (TLI->shouldExpandAtomicLoadInIR(LI)) {453case TargetLoweringBase::AtomicExpansionKind::None:454return false;455case TargetLoweringBase::AtomicExpansionKind::LLSC:456expandAtomicOpToLLSC(457LI, LI->getType(), LI->getPointerOperand(), LI->getAlign(),458LI->getOrdering(),459[](IRBuilderBase &Builder, Value *Loaded) { return Loaded; });460return true;461case TargetLoweringBase::AtomicExpansionKind::LLOnly:462return expandAtomicLoadToLL(LI);463case TargetLoweringBase::AtomicExpansionKind::CmpXChg:464return expandAtomicLoadToCmpXchg(LI);465case TargetLoweringBase::AtomicExpansionKind::NotAtomic:466LI->setAtomic(AtomicOrdering::NotAtomic);467return true;468default:469llvm_unreachable("Unhandled case in tryExpandAtomicLoad");470}471}472473bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {474switch (TLI->shouldExpandAtomicStoreInIR(SI)) {475case TargetLoweringBase::AtomicExpansionKind::None:476return false;477case TargetLoweringBase::AtomicExpansionKind::Expand:478expandAtomicStore(SI);479return true;480case TargetLoweringBase::AtomicExpansionKind::NotAtomic:481SI->setAtomic(AtomicOrdering::NotAtomic);482return true;483default:484llvm_unreachable("Unhandled case in tryExpandAtomicStore");485}486}487488bool AtomicExpandImpl::expandAtomicLoadToLL(LoadInst *LI) {489ReplacementIRBuilder Builder(LI, *DL);490491// On some architectures, load-linked instructions are atomic for larger492// sizes than normal loads. 
For example, the only 64-bit load guaranteed493// to be single-copy atomic by ARM is an ldrexd (A3.5.3).494Value *Val = TLI->emitLoadLinked(Builder, LI->getType(),495LI->getPointerOperand(), LI->getOrdering());496TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);497498LI->replaceAllUsesWith(Val);499LI->eraseFromParent();500501return true;502}503504bool AtomicExpandImpl::expandAtomicLoadToCmpXchg(LoadInst *LI) {505ReplacementIRBuilder Builder(LI, *DL);506AtomicOrdering Order = LI->getOrdering();507if (Order == AtomicOrdering::Unordered)508Order = AtomicOrdering::Monotonic;509510Value *Addr = LI->getPointerOperand();511Type *Ty = LI->getType();512Constant *DummyVal = Constant::getNullValue(Ty);513514Value *Pair = Builder.CreateAtomicCmpXchg(515Addr, DummyVal, DummyVal, LI->getAlign(), Order,516AtomicCmpXchgInst::getStrongestFailureOrdering(Order));517Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");518519LI->replaceAllUsesWith(Loaded);520LI->eraseFromParent();521522return true;523}524525/// Convert an atomic store of a non-integral type to an integer store of the526/// equivalent bitwidth. We used to not support floating point or vector527/// atomics in the IR at all. The backends learned to deal with the bitcast528/// idiom because that was the only way of expressing the notion of a atomic529/// float or vector store. The long term plan is to teach each backend to530/// instruction select from the original atomic store, but as a migration531/// mechanism, we convert back to the old format which the backends understand.532/// Each backend will need individual work to recognize the new format.533StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) {534ReplacementIRBuilder Builder(SI, *DL);535auto *M = SI->getModule();536Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),537M->getDataLayout());538Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);539540Value *Addr = SI->getPointerOperand();541542StoreInst *NewSI = Builder.CreateStore(NewVal, Addr);543NewSI->setAlignment(SI->getAlign());544NewSI->setVolatile(SI->isVolatile());545NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());546LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");547SI->eraseFromParent();548return NewSI;549}550551void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {552// This function is only called on atomic stores that are too large to be553// atomic if implemented as a native store. So we replace them by an554// atomic swap, that can be implemented for example as a ldrex/strex on ARM555// or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.556// It is the responsibility of the target to only signal expansion via557// shouldExpandAtomicRMW in cases where this is required and possible.558ReplacementIRBuilder Builder(SI, *DL);559AtomicOrdering Ordering = SI->getOrdering();560assert(Ordering != AtomicOrdering::NotAtomic);561AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered562? 
AtomicOrdering::Monotonic563: Ordering;564AtomicRMWInst *AI = Builder.CreateAtomicRMW(565AtomicRMWInst::Xchg, SI->getPointerOperand(), SI->getValueOperand(),566SI->getAlign(), RMWOrdering);567SI->eraseFromParent();568569// Now we have an appropriate swap instruction, lower it as usual.570tryExpandAtomicRMW(AI);571}572573static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,574Value *Loaded, Value *NewVal, Align AddrAlign,575AtomicOrdering MemOpOrder, SyncScope::ID SSID,576Value *&Success, Value *&NewLoaded) {577Type *OrigTy = NewVal->getType();578579// This code can go away when cmpxchg supports FP and vector types.580assert(!OrigTy->isPointerTy());581bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();582if (NeedBitcast) {583IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());584NewVal = Builder.CreateBitCast(NewVal, IntTy);585Loaded = Builder.CreateBitCast(Loaded, IntTy);586}587588Value *Pair = Builder.CreateAtomicCmpXchg(589Addr, Loaded, NewVal, AddrAlign, MemOpOrder,590AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);591Success = Builder.CreateExtractValue(Pair, 1, "success");592NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");593594if (NeedBitcast)595NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);596}597598bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {599LLVMContext &Ctx = AI->getModule()->getContext();600TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI);601switch (Kind) {602case TargetLoweringBase::AtomicExpansionKind::None:603return false;604case TargetLoweringBase::AtomicExpansionKind::LLSC: {605unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;606unsigned ValueSize = getAtomicOpSize(AI);607if (ValueSize < MinCASSize) {608expandPartwordAtomicRMW(AI,609TargetLoweringBase::AtomicExpansionKind::LLSC);610} else {611auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {612return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,613AI->getValOperand());614};615expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),616AI->getAlign(), AI->getOrdering(), PerformOp);617}618return true;619}620case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {621unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;622unsigned ValueSize = getAtomicOpSize(AI);623if (ValueSize < MinCASSize) {624expandPartwordAtomicRMW(AI,625TargetLoweringBase::AtomicExpansionKind::CmpXChg);626} else {627SmallVector<StringRef> SSNs;628Ctx.getSyncScopeNames(SSNs);629auto MemScope = SSNs[AI->getSyncScopeID()].empty()630? 
"system"631: SSNs[AI->getSyncScopeID()];632OptimizationRemarkEmitter ORE(AI->getFunction());633ORE.emit([&]() {634return OptimizationRemark(DEBUG_TYPE, "Passed", AI)635<< "A compare and swap loop was generated for an atomic "636<< AI->getOperationName(AI->getOperation()) << " operation at "637<< MemScope << " memory scope";638});639expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);640}641return true;642}643case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {644unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;645unsigned ValueSize = getAtomicOpSize(AI);646if (ValueSize < MinCASSize) {647AtomicRMWInst::BinOp Op = AI->getOperation();648// Widen And/Or/Xor and give the target another chance at expanding it.649if (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||650Op == AtomicRMWInst::And) {651tryExpandAtomicRMW(widenPartwordAtomicRMW(AI));652return true;653}654}655expandAtomicRMWToMaskedIntrinsic(AI);656return true;657}658case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {659TLI->emitBitTestAtomicRMWIntrinsic(AI);660return true;661}662case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {663TLI->emitCmpArithAtomicRMWIntrinsic(AI);664return true;665}666case TargetLoweringBase::AtomicExpansionKind::NotAtomic:667return lowerAtomicRMWInst(AI);668case TargetLoweringBase::AtomicExpansionKind::Expand:669TLI->emitExpandAtomicRMW(AI);670return true;671default:672llvm_unreachable("Unhandled case in tryExpandAtomicRMW");673}674}675676namespace {677678struct PartwordMaskValues {679// These three fields are guaranteed to be set by createMaskInstrs.680Type *WordType = nullptr;681Type *ValueType = nullptr;682Type *IntValueType = nullptr;683Value *AlignedAddr = nullptr;684Align AlignedAddrAlignment;685// The remaining fields can be null.686Value *ShiftAmt = nullptr;687Value *Mask = nullptr;688Value *Inv_Mask = nullptr;689};690691LLVM_ATTRIBUTE_UNUSED692raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {693auto PrintObj = [&O](auto *V) {694if (V)695O << *V;696else697O << "nullptr";698O << '\n';699};700O << "PartwordMaskValues {\n";701O << " WordType: ";702PrintObj(PMV.WordType);703O << " ValueType: ";704PrintObj(PMV.ValueType);705O << " AlignedAddr: ";706PrintObj(PMV.AlignedAddr);707O << " AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << '\n';708O << " ShiftAmt: ";709PrintObj(PMV.ShiftAmt);710O << " Mask: ";711PrintObj(PMV.Mask);712O << " Inv_Mask: ";713PrintObj(PMV.Inv_Mask);714O << "}\n";715return O;716}717718} // end anonymous namespace719720/// This is a helper function which builds instructions to provide721/// values necessary for partword atomic operations. 
It takes an722/// incoming address, Addr, and ValueType, and constructs the address,723/// shift-amounts and masks needed to work with a larger value of size724/// WordSize.725///726/// AlignedAddr: Addr rounded down to a multiple of WordSize727///728/// ShiftAmt: Number of bits to right-shift a WordSize value loaded729/// from AlignAddr for it to have the same value as if730/// ValueType was loaded from Addr.731///732/// Mask: Value to mask with the value loaded from AlignAddr to733/// include only the part that would've been loaded from Addr.734///735/// Inv_Mask: The inverse of Mask.736static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,737Instruction *I, Type *ValueType,738Value *Addr, Align AddrAlign,739unsigned MinWordSize) {740PartwordMaskValues PMV;741742Module *M = I->getModule();743LLVMContext &Ctx = M->getContext();744const DataLayout &DL = M->getDataLayout();745unsigned ValueSize = DL.getTypeStoreSize(ValueType);746747PMV.ValueType = PMV.IntValueType = ValueType;748if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())749PMV.IntValueType =750Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits());751752PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8)753: ValueType;754if (PMV.ValueType == PMV.WordType) {755PMV.AlignedAddr = Addr;756PMV.AlignedAddrAlignment = AddrAlign;757PMV.ShiftAmt = ConstantInt::get(PMV.ValueType, 0);758PMV.Mask = ConstantInt::get(PMV.ValueType, ~0, /*isSigned*/ true);759return PMV;760}761762PMV.AlignedAddrAlignment = Align(MinWordSize);763764assert(ValueSize < MinWordSize);765766PointerType *PtrTy = cast<PointerType>(Addr->getType());767IntegerType *IntTy = DL.getIndexType(Ctx, PtrTy->getAddressSpace());768Value *PtrLSB;769770if (AddrAlign < MinWordSize) {771PMV.AlignedAddr = Builder.CreateIntrinsic(772Intrinsic::ptrmask, {PtrTy, IntTy},773{Addr, ConstantInt::get(IntTy, ~(uint64_t)(MinWordSize - 1))}, nullptr,774"AlignedAddr");775776Value *AddrInt = Builder.CreatePtrToInt(Addr, IntTy);777PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");778} else {779// If the alignment is high enough, the LSB are known 0.780PMV.AlignedAddr = Addr;781PtrLSB = ConstantInt::getNullValue(IntTy);782}783784if (DL.isLittleEndian()) {785// turn bytes into bits786PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3);787} else {788// turn bytes into bits, and count from the other side.789PMV.ShiftAmt = Builder.CreateShl(790Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3);791}792793PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt");794PMV.Mask = Builder.CreateShl(795ConstantInt::get(PMV.WordType, (1 << (ValueSize * 8)) - 1), PMV.ShiftAmt,796"Mask");797798PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask");799800return PMV;801}802803static Value *extractMaskedValue(IRBuilderBase &Builder, Value *WideWord,804const PartwordMaskValues &PMV) {805assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");806if (PMV.WordType == PMV.ValueType)807return WideWord;808809Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted");810Value *Trunc = Builder.CreateTrunc(Shift, PMV.IntValueType, "extracted");811return Builder.CreateBitCast(Trunc, PMV.ValueType);812}813814static Value *insertMaskedValue(IRBuilderBase &Builder, Value *WideWord,815Value *Updated, const PartwordMaskValues &PMV) {816assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");817assert(Updated->getType() == PMV.ValueType && "Value type mismatch");818if (PMV.WordType == PMV.ValueType)819return 
Updated;820821Updated = Builder.CreateBitCast(Updated, PMV.IntValueType);822823Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended");824Value *Shift =825Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true);826Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked");827Value *Or = Builder.CreateOr(And, Shift, "inserted");828return Or;829}830831/// Emit IR to implement a masked version of a given atomicrmw832/// operation. (That is, only the bits under the Mask should be833/// affected by the operation)834static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,835IRBuilderBase &Builder, Value *Loaded,836Value *Shifted_Inc, Value *Inc,837const PartwordMaskValues &PMV) {838// TODO: update to use839// https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order840// to merge bits from two values without requiring PMV.Inv_Mask.841switch (Op) {842case AtomicRMWInst::Xchg: {843Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);844Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);845return FinalVal;846}847case AtomicRMWInst::Or:848case AtomicRMWInst::Xor:849case AtomicRMWInst::And:850llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");851case AtomicRMWInst::Add:852case AtomicRMWInst::Sub:853case AtomicRMWInst::Nand: {854// The other arithmetic ops need to be masked into place.855Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Shifted_Inc);856Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);857Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);858Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);859return FinalVal;860}861case AtomicRMWInst::Max:862case AtomicRMWInst::Min:863case AtomicRMWInst::UMax:864case AtomicRMWInst::UMin:865case AtomicRMWInst::FAdd:866case AtomicRMWInst::FSub:867case AtomicRMWInst::FMin:868case AtomicRMWInst::FMax:869case AtomicRMWInst::UIncWrap:870case AtomicRMWInst::UDecWrap: {871// Finally, other ops will operate on the full value, so truncate down to872// the original size, and expand out again after doing the873// operation. Bitcasts will be inserted for FP values.874Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV);875Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded_Extract, Inc);876Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV);877return FinalVal;878}879default:880llvm_unreachable("Unknown atomic op");881}882}883884/// Expand a sub-word atomicrmw operation into an appropriate885/// word-sized operation.886///887/// It will create an LL/SC or cmpxchg loop, as appropriate, the same888/// way as a typical atomicrmw expansion. 
The only difference here is889/// that the operation inside of the loop may operate upon only a890/// part of the value.891void AtomicExpandImpl::expandPartwordAtomicRMW(892AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {893// Widen And/Or/Xor and give the target another chance at expanding it.894AtomicRMWInst::BinOp Op = AI->getOperation();895if (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||896Op == AtomicRMWInst::And) {897tryExpandAtomicRMW(widenPartwordAtomicRMW(AI));898return;899}900AtomicOrdering MemOpOrder = AI->getOrdering();901SyncScope::ID SSID = AI->getSyncScopeID();902903ReplacementIRBuilder Builder(AI, *DL);904905PartwordMaskValues PMV =906createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),907AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);908909Value *ValOperand_Shifted = nullptr;910if (Op == AtomicRMWInst::Xchg || Op == AtomicRMWInst::Add ||911Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Nand) {912Value *ValOp = Builder.CreateBitCast(AI->getValOperand(), PMV.IntValueType);913ValOperand_Shifted =914Builder.CreateShl(Builder.CreateZExt(ValOp, PMV.WordType), PMV.ShiftAmt,915"ValOperand_Shifted");916}917918auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {919return performMaskedAtomicOp(Op, Builder, Loaded, ValOperand_Shifted,920AI->getValOperand(), PMV);921};922923Value *OldResult;924if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {925OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,926PMV.AlignedAddrAlignment, MemOpOrder, SSID,927PerformPartwordOp, createCmpXchgInstFun);928} else {929assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);930OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,931PMV.AlignedAddrAlignment, MemOpOrder,932PerformPartwordOp);933}934935Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);936AI->replaceAllUsesWith(FinalOldResult);937AI->eraseFromParent();938}939940/// Copy metadata that's safe to preserve when widening atomics.941static void copyMetadataForAtomic(Instruction &Dest,942const Instruction &Source) {943SmallVector<std::pair<unsigned, MDNode *>, 8> MD;944Source.getAllMetadata(MD);945LLVMContext &Ctx = Dest.getContext();946MDBuilder MDB(Ctx);947948for (auto [ID, N] : MD) {949switch (ID) {950case LLVMContext::MD_dbg:951case LLVMContext::MD_tbaa:952case LLVMContext::MD_tbaa_struct:953case LLVMContext::MD_alias_scope:954case LLVMContext::MD_noalias:955case LLVMContext::MD_access_group:956case LLVMContext::MD_mmra:957Dest.setMetadata(ID, N);958break;959default:960if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory"))961Dest.setMetadata(ID, N);962else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory"))963Dest.setMetadata(ID, N);964965break;966}967}968}969970// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.971AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {972ReplacementIRBuilder Builder(AI, *DL);973AtomicRMWInst::BinOp Op = AI->getOperation();974975assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||976Op == AtomicRMWInst::And) &&977"Unable to widen operation");978979PartwordMaskValues PMV =980createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),981AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);982983Value *ValOperand_Shifted =984Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),985PMV.ShiftAmt, "ValOperand_Shifted");986987Value *NewOperand;988989if (Op == 
AtomicRMWInst::And)990NewOperand =991Builder.CreateOr(ValOperand_Shifted, PMV.Inv_Mask, "AndOperand");992else993NewOperand = ValOperand_Shifted;994995AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(996Op, PMV.AlignedAddr, NewOperand, PMV.AlignedAddrAlignment,997AI->getOrdering(), AI->getSyncScopeID());998999copyMetadataForAtomic(*NewAI, *AI);10001001Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);1002AI->replaceAllUsesWith(FinalOldResult);1003AI->eraseFromParent();1004return NewAI;1005}10061007bool AtomicExpandImpl::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {1008// The basic idea here is that we're expanding a cmpxchg of a1009// smaller memory size up to a word-sized cmpxchg. To do this, we1010// need to add a retry-loop for strong cmpxchg, so that1011// modifications to other parts of the word don't cause a spurious1012// failure.10131014// This generates code like the following:1015// [[Setup mask values PMV.*]]1016// %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt1017// %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt1018// %InitLoaded = load i32* %addr1019// %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask1020// br partword.cmpxchg.loop1021// partword.cmpxchg.loop:1022// %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],1023// [ %OldVal_MaskOut, %partword.cmpxchg.failure ]1024// %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted1025// %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted1026// %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,1027// i32 %FullWord_NewVal success_ordering failure_ordering1028// %OldVal = extractvalue { i32, i1 } %NewCI, 01029// %Success = extractvalue { i32, i1 } %NewCI, 11030// br i1 %Success, label %partword.cmpxchg.end,1031// label %partword.cmpxchg.failure1032// partword.cmpxchg.failure:1033// %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask1034// %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut1035// br i1 %ShouldContinue, label %partword.cmpxchg.loop,1036// label %partword.cmpxchg.end1037// partword.cmpxchg.end:1038// %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt1039// %FinalOldVal = trunc i32 %tmp1 to i81040// %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 01041// %Res = insertvalue { i8, i1 } %25, i1 %Success, 110421043Value *Addr = CI->getPointerOperand();1044Value *Cmp = CI->getCompareOperand();1045Value *NewVal = CI->getNewValOperand();10461047BasicBlock *BB = CI->getParent();1048Function *F = BB->getParent();1049ReplacementIRBuilder Builder(CI, *DL);1050LLVMContext &Ctx = Builder.getContext();10511052BasicBlock *EndBB =1053BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");1054auto FailureBB =1055BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);1056auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);10571058// The split call above "helpfully" added a branch at the end of BB1059// (to the wrong place).1060std::prev(BB->end())->eraseFromParent();1061Builder.SetInsertPoint(BB);10621063PartwordMaskValues PMV =1064createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,1065CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);10661067// Shift the incoming values over, into the right location in the word.1068Value *NewVal_Shifted =1069Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);1070Value *Cmp_Shifted =1071Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);10721073// Load the entire current word, and mask into place the expected and new1074// values1075LoadInst *InitLoaded = 
Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);1076InitLoaded->setVolatile(CI->isVolatile());1077Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);1078Builder.CreateBr(LoopBB);10791080// partword.cmpxchg.loop:1081Builder.SetInsertPoint(LoopBB);1082PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);1083Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);10841085// Mask/Or the expected and new values into place in the loaded word.1086Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);1087Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);1088AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(1089PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, PMV.AlignedAddrAlignment,1090CI->getSuccessOrdering(), CI->getFailureOrdering(), CI->getSyncScopeID());1091NewCI->setVolatile(CI->isVolatile());1092// When we're building a strong cmpxchg, we need a loop, so you1093// might think we could use a weak cmpxchg inside. But, using strong1094// allows the below comparison for ShouldContinue, and we're1095// expecting the underlying cmpxchg to be a machine instruction,1096// which is strong anyways.1097NewCI->setWeak(CI->isWeak());10981099Value *OldVal = Builder.CreateExtractValue(NewCI, 0);1100Value *Success = Builder.CreateExtractValue(NewCI, 1);11011102if (CI->isWeak())1103Builder.CreateBr(EndBB);1104else1105Builder.CreateCondBr(Success, EndBB, FailureBB);11061107// partword.cmpxchg.failure:1108Builder.SetInsertPoint(FailureBB);1109// Upon failure, verify that the masked-out part of the loaded value1110// has been modified. If it didn't, abort the cmpxchg, since the1111// masked-in part must've.1112Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);1113Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);1114Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);11151116// Add the second value to the phi from above1117Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);11181119// partword.cmpxchg.end:1120Builder.SetInsertPoint(CI);11211122Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);1123Value *Res = PoisonValue::get(CI->getType());1124Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);1125Res = Builder.CreateInsertValue(Res, Success, 1);11261127CI->replaceAllUsesWith(Res);1128CI->eraseFromParent();1129return true;1130}11311132void AtomicExpandImpl::expandAtomicOpToLLSC(1133Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign,1134AtomicOrdering MemOpOrder,1135function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {1136ReplacementIRBuilder Builder(I, *DL);1137Value *Loaded = insertRMWLLSCLoop(Builder, ResultType, Addr, AddrAlign,1138MemOpOrder, PerformOp);11391140I->replaceAllUsesWith(Loaded);1141I->eraseFromParent();1142}11431144void AtomicExpandImpl::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {1145ReplacementIRBuilder Builder(AI, *DL);11461147PartwordMaskValues PMV =1148createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),1149AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);11501151// The value operand must be sign-extended for signed min/max so that the1152// target's signed comparison instructions can be used. 
Otherwise, just1153// zero-ext.1154Instruction::CastOps CastOp = Instruction::ZExt;1155AtomicRMWInst::BinOp RMWOp = AI->getOperation();1156if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min)1157CastOp = Instruction::SExt;11581159Value *ValOperand_Shifted = Builder.CreateShl(1160Builder.CreateCast(CastOp, AI->getValOperand(), PMV.WordType),1161PMV.ShiftAmt, "ValOperand_Shifted");1162Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(1163Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt,1164AI->getOrdering());1165Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);1166AI->replaceAllUsesWith(FinalOldResult);1167AI->eraseFromParent();1168}11691170void AtomicExpandImpl::expandAtomicCmpXchgToMaskedIntrinsic(1171AtomicCmpXchgInst *CI) {1172ReplacementIRBuilder Builder(CI, *DL);11731174PartwordMaskValues PMV = createMaskInstrs(1175Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(),1176CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);11771178Value *CmpVal_Shifted = Builder.CreateShl(1179Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt,1180"CmpVal_Shifted");1181Value *NewVal_Shifted = Builder.CreateShl(1182Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt,1183"NewVal_Shifted");1184Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(1185Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,1186CI->getMergedOrdering());1187Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);1188Value *Res = PoisonValue::get(CI->getType());1189Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);1190Value *Success = Builder.CreateICmpEQ(1191CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success");1192Res = Builder.CreateInsertValue(Res, Success, 1);11931194CI->replaceAllUsesWith(Res);1195CI->eraseFromParent();1196}11971198Value *AtomicExpandImpl::insertRMWLLSCLoop(1199IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,1200AtomicOrdering MemOpOrder,1201function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {1202LLVMContext &Ctx = Builder.getContext();1203BasicBlock *BB = Builder.GetInsertBlock();1204Function *F = BB->getParent();12051206assert(AddrAlign >=1207F->getDataLayout().getTypeStoreSize(ResultTy) &&1208"Expected at least natural alignment at this point.");12091210// Given: atomicrmw some_op iN* %addr, iN %incr ordering1211//1212// The standard expansion we produce is:1213// [...]1214// atomicrmw.start:1215// %loaded = @load.linked(%addr)1216// %new = some_op iN %loaded, %incr1217// %stored = @store_conditional(%new, %addr)1218// %try_again = icmp i32 ne %stored, 01219// br i1 %try_again, label %loop, label %atomicrmw.end1220// atomicrmw.end:1221// [...]1222BasicBlock *ExitBB =1223BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");1224BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);12251226// The split call above "helpfully" added a branch at the end of BB (to the1227// wrong place).1228std::prev(BB->end())->eraseFromParent();1229Builder.SetInsertPoint(BB);1230Builder.CreateBr(LoopBB);12311232// Start the main loop block now that we've taken care of the preliminaries.1233Builder.SetInsertPoint(LoopBB);1234Value *Loaded = TLI->emitLoadLinked(Builder, ResultTy, Addr, MemOpOrder);12351236Value *NewVal = PerformOp(Builder, Loaded);12371238Value *StoreSuccess =1239TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);1240Value *TryAgain = Builder.CreateICmpNE(1241StoreSuccess, 
ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");1242Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);12431244Builder.SetInsertPoint(ExitBB, ExitBB->begin());1245return Loaded;1246}12471248/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of1249/// the equivalent bitwidth. We used to not support pointer cmpxchg in the1250/// IR. As a migration step, we convert back to what use to be the standard1251/// way to represent a pointer cmpxchg so that we can update backends one by1252/// one.1253AtomicCmpXchgInst *1254AtomicExpandImpl::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {1255auto *M = CI->getModule();1256Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),1257M->getDataLayout());12581259ReplacementIRBuilder Builder(CI, *DL);12601261Value *Addr = CI->getPointerOperand();12621263Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);1264Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);12651266auto *NewCI = Builder.CreateAtomicCmpXchg(1267Addr, NewCmp, NewNewVal, CI->getAlign(), CI->getSuccessOrdering(),1268CI->getFailureOrdering(), CI->getSyncScopeID());1269NewCI->setVolatile(CI->isVolatile());1270NewCI->setWeak(CI->isWeak());1271LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");12721273Value *OldVal = Builder.CreateExtractValue(NewCI, 0);1274Value *Succ = Builder.CreateExtractValue(NewCI, 1);12751276OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());12771278Value *Res = PoisonValue::get(CI->getType());1279Res = Builder.CreateInsertValue(Res, OldVal, 0);1280Res = Builder.CreateInsertValue(Res, Succ, 1);12811282CI->replaceAllUsesWith(Res);1283CI->eraseFromParent();1284return NewCI;1285}12861287bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {1288AtomicOrdering SuccessOrder = CI->getSuccessOrdering();1289AtomicOrdering FailureOrder = CI->getFailureOrdering();1290Value *Addr = CI->getPointerOperand();1291BasicBlock *BB = CI->getParent();1292Function *F = BB->getParent();1293LLVMContext &Ctx = F->getContext();1294// If shouldInsertFencesForAtomic() returns true, then the target does not1295// want to deal with memory orders, and emitLeading/TrailingFence should take1296// care of everything. Otherwise, emitLeading/TrailingFence are no-op and we1297// should preserve the ordering.1298bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);1299AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic1300? AtomicOrdering::Monotonic1301: CI->getMergedOrdering();13021303// In implementations which use a barrier to achieve release semantics, we can1304// delay emitting this barrier until we know a store is actually going to be1305// attempted. The cost of this delay is that we need 2 copies of the block1306// emitting the load-linked, affecting code size.1307//1308// Ideally, this logic would be unconditional except for the minsize check1309// since in other cases the extra blocks naturally collapse down to the1310// minimal loop. 
Unfortunately, this puts too much stress on later1311// optimisations so we avoid emitting the extra logic in those cases too.1312bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&1313SuccessOrder != AtomicOrdering::Monotonic &&1314SuccessOrder != AtomicOrdering::Acquire &&1315!F->hasMinSize();13161317// There's no overhead for sinking the release barrier in a weak cmpxchg, so1318// do it even on minsize.1319bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();13201321// Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord1322//1323// The full expansion we produce is:1324// [...]1325// %aligned.addr = ...1326// cmpxchg.start:1327// %unreleasedload = @load.linked(%aligned.addr)1328// %unreleasedload.extract = extract value from %unreleasedload1329// %should_store = icmp eq %unreleasedload.extract, %desired1330// br i1 %should_store, label %cmpxchg.releasingstore,1331// label %cmpxchg.nostore1332// cmpxchg.releasingstore:1333// fence?1334// br label cmpxchg.trystore1335// cmpxchg.trystore:1336// %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],1337// [%releasedload, %cmpxchg.releasedload]1338// %updated.new = insert %new into %loaded.trystore1339// %stored = @store_conditional(%updated.new, %aligned.addr)1340// %success = icmp eq i32 %stored, 01341// br i1 %success, label %cmpxchg.success,1342// label %cmpxchg.releasedload/%cmpxchg.failure1343// cmpxchg.releasedload:1344// %releasedload = @load.linked(%aligned.addr)1345// %releasedload.extract = extract value from %releasedload1346// %should_store = icmp eq %releasedload.extract, %desired1347// br i1 %should_store, label %cmpxchg.trystore,1348// label %cmpxchg.failure1349// cmpxchg.success:1350// fence?1351// br label %cmpxchg.end1352// cmpxchg.nostore:1353// %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],1354// [%releasedload,1355// %cmpxchg.releasedload/%cmpxchg.trystore]1356// @load_linked_fail_balance()?1357// br label %cmpxchg.failure1358// cmpxchg.failure:1359// fence?1360// br label %cmpxchg.end1361// cmpxchg.end:1362// %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],1363// [%loaded.trystore, %cmpxchg.trystore]1364// %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]1365// %loaded = extract value from %loaded.exit1366// %restmp = insertvalue { iN, i1 } undef, iN %loaded, 01367// %res = insertvalue { iN, i1 } %restmp, i1 %success, 11368// [...]1369BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");1370auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);1371auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);1372auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);1373auto ReleasedLoadBB =1374BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);1375auto TryStoreBB =1376BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);1377auto ReleasingStoreBB =1378BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);1379auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);13801381ReplacementIRBuilder Builder(CI, *DL);13821383// The split call above "helpfully" added a branch at the end of BB (to the1384// wrong place), but we might want a fence too. 
It's easiest to just remove1385// the branch entirely.1386std::prev(BB->end())->eraseFromParent();1387Builder.SetInsertPoint(BB);1388if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)1389TLI->emitLeadingFence(Builder, CI, SuccessOrder);13901391PartwordMaskValues PMV =1392createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,1393CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);1394Builder.CreateBr(StartBB);13951396// Start the main loop block now that we've taken care of the preliminaries.1397Builder.SetInsertPoint(StartBB);1398Value *UnreleasedLoad =1399TLI->emitLoadLinked(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder);1400Value *UnreleasedLoadExtract =1401extractMaskedValue(Builder, UnreleasedLoad, PMV);1402Value *ShouldStore = Builder.CreateICmpEQ(1403UnreleasedLoadExtract, CI->getCompareOperand(), "should_store");14041405// If the cmpxchg doesn't actually need any ordering when it fails, we can1406// jump straight past that fence instruction (if it exists).1407Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);14081409Builder.SetInsertPoint(ReleasingStoreBB);1410if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)1411TLI->emitLeadingFence(Builder, CI, SuccessOrder);1412Builder.CreateBr(TryStoreBB);14131414Builder.SetInsertPoint(TryStoreBB);1415PHINode *LoadedTryStore =1416Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore");1417LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB);1418Value *NewValueInsert =1419insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV);1420Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewValueInsert,1421PMV.AlignedAddr, MemOpOrder);1422StoreSuccess = Builder.CreateICmpEQ(1423StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");1424BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;1425Builder.CreateCondBr(StoreSuccess, SuccessBB,1426CI->isWeak() ? 
FailureBB : RetryBB);14271428Builder.SetInsertPoint(ReleasedLoadBB);1429Value *SecondLoad;1430if (HasReleasedLoadBB) {1431SecondLoad =1432TLI->emitLoadLinked(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder);1433Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV);1434ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract,1435CI->getCompareOperand(), "should_store");14361437// If the cmpxchg doesn't actually need any ordering when it fails, we can1438// jump straight past that fence instruction (if it exists).1439Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);1440// Update PHI node in TryStoreBB.1441LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);1442} else1443Builder.CreateUnreachable();14441445// Make sure later instructions don't get reordered with a fence if1446// necessary.1447Builder.SetInsertPoint(SuccessBB);1448if (ShouldInsertFencesForAtomic ||1449TLI->shouldInsertTrailingFenceForAtomicStore(CI))1450TLI->emitTrailingFence(Builder, CI, SuccessOrder);1451Builder.CreateBr(ExitBB);14521453Builder.SetInsertPoint(NoStoreBB);1454PHINode *LoadedNoStore =1455Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore");1456LoadedNoStore->addIncoming(UnreleasedLoad, StartBB);1457if (HasReleasedLoadBB)1458LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB);14591460// In the failing case, where we don't execute the store-conditional, the1461// target might want to balance out the load-linked with a dedicated1462// instruction (e.g., on ARM, clearing the exclusive monitor).1463TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);1464Builder.CreateBr(FailureBB);14651466Builder.SetInsertPoint(FailureBB);1467PHINode *LoadedFailure =1468Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure");1469LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB);1470if (CI->isWeak())1471LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB);1472if (ShouldInsertFencesForAtomic)1473TLI->emitTrailingFence(Builder, CI, FailureOrder);1474Builder.CreateBr(ExitBB);14751476// Finally, we have control-flow based knowledge of whether the cmpxchg1477// succeeded or not. We expose this to later passes by converting any1478// subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate1479// PHI.1480Builder.SetInsertPoint(ExitBB, ExitBB->begin());1481PHINode *LoadedExit =1482Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit");1483LoadedExit->addIncoming(LoadedTryStore, SuccessBB);1484LoadedExit->addIncoming(LoadedFailure, FailureBB);1485PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success");1486Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);1487Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);14881489// This is the "exit value" from the cmpxchg expansion. 
  // This is the "exit value" from the cmpxchg expansion. It may be of
  // a type wider than the one in the cmpxchg instruction.
  Value *LoadedFull = LoadedExit;

  Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator()));
  Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV);

  // Look for any users of the cmpxchg that are just comparing the loaded value
  // against the desired one, and replace them with the CFG-derived version.
  SmallVector<ExtractValueInst *, 2> PrunedInsts;
  for (auto *User : CI->users()) {
    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
    if (!EV)
      continue;

    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
           "weird extraction from { iN, i1 }");

    if (EV->getIndices()[0] == 0)
      EV->replaceAllUsesWith(Loaded);
    else
      EV->replaceAllUsesWith(Success);

    PrunedInsts.push_back(EV);
  }

  // We can remove the instructions now we're no longer iterating through them.
  for (auto *EV : PrunedInsts)
    EV->eraseFromParent();

  if (!CI->use_empty()) {
    // Some use of the full struct return that we don't understand has
    // happened, so we've got to reconstruct it properly.
    Value *Res;
    Res = Builder.CreateInsertValue(PoisonValue::get(CI->getType()), Loaded, 0);
    Res = Builder.CreateInsertValue(Res, Success, 1);

    CI->replaceAllUsesWith(Res);
  }

  CI->eraseFromParent();
  return true;
}

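// Illustrative examples (not exhaustive): the following atomicrmw operations
// never change the value in memory, so the target may be able to lower them
// to a fenced load instead (see simplifyIdempotentRMW below):
//   atomicrmw add ptr %p, i32 0  seq_cst
//   atomicrmw or  ptr %p, i32 0  seq_cst
//   atomicrmw and ptr %p, i32 -1 seq_cst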
bool AtomicExpandImpl::isIdempotentRMW(AtomicRMWInst *RMWI) {
  auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
  if (!C)
    return false;

  AtomicRMWInst::BinOp Op = RMWI->getOperation();
  switch (Op) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
    return C->isZero();
  case AtomicRMWInst::And:
    return C->isMinusOne();
  // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
  default:
    return false;
  }
}

bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
  if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
    tryExpandAtomicLoad(ResultingLoad);
    return true;
  }
  return false;
}

Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
    IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
    AtomicOrdering MemOpOrder, SyncScope::ID SSID,
    function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
    CreateCmpXchgInstFun CreateCmpXchg) {
  LLVMContext &Ctx = Builder.getContext();
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();

  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
  //
  // The standard expansion we produce is:
  //     [...]
  //     %init_loaded = load atomic iN* %addr
  //     br label %loop
  // loop:
  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
  //     %new = some_op iN %loaded, %incr
  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
  //     %success = extractvalue { iN, i1 } %pair, 1
  //     br i1 %success, label %atomicrmw.end, label %loop
  // atomicrmw.end:
  //     [...]
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);

  // The split call above "helpfully" added a branch at the end of BB (to the
  // wrong place), but we want a load. It's easiest to just remove the branch
  // entirely.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  LoadInst *InitLoaded = Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign);
  Builder.CreateBr(LoopBB);

  // Start the main loop block now that we've taken care of the preliminaries.
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
  Loaded->addIncoming(InitLoaded, BB);

  Value *NewVal = PerformOp(Builder, Loaded);

  Value *NewLoaded = nullptr;
  Value *Success = nullptr;

  CreateCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign,
                MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
                SSID, Success, NewLoaded);
  assert(Success && NewLoaded);

  Loaded->addIncoming(NewLoaded, LoopBB);

  Builder.CreateCondBr(Success, ExitBB, LoopBB);

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
  return NewLoaded;
}

bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
  unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
  unsigned ValueSize = getAtomicOpSize(CI);

  switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
  default:
    llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
  case TargetLoweringBase::AtomicExpansionKind::None:
    if (ValueSize < MinCASSize)
      return expandPartwordCmpXchg(CI);
    return false;
  case TargetLoweringBase::AtomicExpansionKind::LLSC: {
    return expandAtomicCmpXchg(CI);
  }
  case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
    expandAtomicCmpXchgToMaskedIntrinsic(CI);
    return true;
  case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
    return lowerAtomicCmpXchgInst(CI);
  }
}

// Note: This function is exposed externally by AtomicExpandUtils.h
bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                    CreateCmpXchgInstFun CreateCmpXchg) {
  ReplacementIRBuilder Builder(AI, AI->getDataLayout());
  Builder.setIsFPConstrained(
      AI->getFunction()->hasFnAttribute(Attribute::StrictFP));

  // FIXME: If FP exceptions are observable, we should force them off for the
  // loop for the FP atomics.
  Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop(
      Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(),
      AI->getOrdering(), AI->getSyncScopeID(),
      [&](IRBuilderBase &Builder, Value *Loaded) {
        return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
                                   AI->getValOperand());
      },
      CreateCmpXchg);

  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return true;
}

// In order to use one of the sized library calls such as
// __atomic_fetch_add_4, the alignment must be sufficient, the size
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
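//
// For example (illustrative): an aligned 4-byte atomic load can be lowered to
// __atomic_load_4, but a 4-byte load with only 2-byte alignment, or an
// operation on a type wider than the largest legal C integer, has to go
// through the generic, size_t-taking __atomic_* entry points instead.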
static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
                                  const DataLayout &DL) {
  // TODO: "LargestSize" is an approximation for "largest type that
  // you can express in C". It seems to be the case that int128 is
  // supported on all 64-bit platforms, otherwise only up to 64-bit
  // integers are supported. If we get this wrong, then we'll try to
  // call a sized libcall that doesn't actually exist. There should
  // really be some more reliable way in LLVM of determining integer
  // sizes which are valid in the target's C ABI...
  unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
  return Alignment >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
      RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
      I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  if (!expanded)
    report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load");
}

void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
      RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
      nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
  if (!expanded)
    report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store");
}

void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
  static const RTLIB::Libcall Libcalls[6] = {
      RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
      RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
  unsigned Size = getAtomicOpSize(I);

  bool expanded = expandAtomicOpToLibcall(
      I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
      I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
      Libcalls);
  if (!expanded)
    report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS");
}

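// Map an atomicrmw operation to its family of __atomic_* libcalls. Index 0 is
// the generic (unsized) variant, or UNKNOWN_LIBCALL when libatomic does not
// provide one; indices 1-5 are the 1/2/4/8/16-byte specializations.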
static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
  static const RTLIB::Libcall LibcallsXchg[6] = {
      RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
      RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
      RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
  static const RTLIB::Libcall LibcallsAdd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
      RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
      RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
  static const RTLIB::Libcall LibcallsSub[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
      RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
      RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
  static const RTLIB::Libcall LibcallsAnd[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
      RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
      RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
  static const RTLIB::Libcall LibcallsOr[6] = {
      RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
      RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
      RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
  static const RTLIB::Libcall LibcallsXor[6] = {
      RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
      RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
      RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
  static const RTLIB::Libcall LibcallsNand[6] = {
      RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
      RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
      RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};

  switch (Op) {
  case AtomicRMWInst::BAD_BINOP:
    llvm_unreachable("Should not have BAD_BINOP.");
  case AtomicRMWInst::Xchg:
    return ArrayRef(LibcallsXchg);
  case AtomicRMWInst::Add:
    return ArrayRef(LibcallsAdd);
  case AtomicRMWInst::Sub:
    return ArrayRef(LibcallsSub);
  case AtomicRMWInst::And:
    return ArrayRef(LibcallsAnd);
  case AtomicRMWInst::Or:
    return ArrayRef(LibcallsOr);
  case AtomicRMWInst::Xor:
    return ArrayRef(LibcallsXor);
  case AtomicRMWInst::Nand:
    return ArrayRef(LibcallsNand);
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    // No atomic libcalls are available for max/min/umax/umin.
    return {};
  }
  llvm_unreachable("Unexpected AtomicRMW operation.");
}

void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
  ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());

  unsigned Size = getAtomicOpSize(I);

  bool Success = false;
  if (!Libcalls.empty())
    Success = expandAtomicOpToLibcall(
        I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
        nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);

  // The expansion failed: either there were no libcalls at all for
  // the operation (min/max), or there were only size-specialized
  // libcalls (add/sub/etc) and we needed a generic. So, expand to a
  // CAS libcall, via a CAS loop, instead.
  if (!Success) {
    expandAtomicRMWToCmpXchg(
        I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
                  Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
                  SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
          // Create the CAS instruction normally...
          AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
              Addr, Loaded, NewVal, Alignment, MemOpOrder,
              AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
          Success = Builder.CreateExtractValue(Pair, 1, "success");
          NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");

          // ...and then expand the CAS into a libcall.
          expandAtomicCASToLibcall(Pair);
        });
  }
}

// A helper routine for the above expandAtomic*ToLibcall functions.
//
// 'Libcalls' contains an array of enum values for the particular
// ATOMIC libcalls to be emitted. All of the other arguments besides
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
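//
// For example (illustrative only, not a verbatim dump): expanding an atomic
// load that cannot use a sized libcall produces roughly
//     %ret = alloca %T                      ; in the function's entry block
//     call void @__atomic_load(i<ptrsize> <Size>, ptr %obj, ptr %ret,
//                              i32 <ordering>)
//     %val = load %T, ptr %ret
// with llvm.lifetime.start/end markers bracketing the temporary.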
bool AtomicExpandImpl::expandAtomicOpToLibcall(
    Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
    Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
    AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
  assert(Libcalls.size() == 6);

  LLVMContext &Ctx = I->getContext();
  Module *M = I->getModule();
  const DataLayout &DL = M->getDataLayout();
  IRBuilder<> Builder(I);
  IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());

  bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
  Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);

  const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);

  // TODO: the "order" argument type is "int", not int32. So
  // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
  ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
  assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
  Constant *OrderingVal =
      ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
  Constant *Ordering2Val = nullptr;
  if (CASExpected) {
    assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
    Ordering2Val =
        ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
  }
  bool HasResult = I->getType() != Type::getVoidTy(Ctx);

  RTLIB::Libcall RTLibType;
  if (UseSizedLibcall) {
    switch (Size) {
    case 1:
      RTLibType = Libcalls[1];
      break;
    case 2:
      RTLibType = Libcalls[2];
      break;
    case 4:
      RTLibType = Libcalls[3];
      break;
    case 8:
      RTLibType = Libcalls[4];
      break;
    case 16:
      RTLibType = Libcalls[5];
      break;
    }
  } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
    RTLibType = Libcalls[0];
  } else {
    // Can't use a sized function, and there's no generic for this
    // operation, so give up.
    return false;
  }

  if (!TLI->getLibcallName(RTLibType)) {
    // This target does not implement the requested atomic libcall, so give up.
    return false;
  }

  // Build up the function call. There are two kinds. First, the sized
  // variants. These calls are going to be one of the following (with
  // N=1,2,4,8,16):
  //   iN   __atomic_load_N(iN *ptr, int ordering)
  //   void __atomic_store_N(iN *ptr, iN val, int ordering)
  //   iN   __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
  //   bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
  //                                    int success_order, int failure_order)
  //
  // Note that these functions can be used for non-integer atomic
  // operations, the values just need to be bitcast to integers on the
  // way in and out.
  //
  // And, then, the generic variants. They look like the following:
  //   void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
  //   void __atomic_store(size_t size, void *ptr, void *val, int ordering)
  //   void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
  //                          int ordering)
  //   bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
  //                                  void *desired, int success_order,
  //                                  int failure_order)
  //
  // The different signatures are built up depending on the
  // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
  // variables.
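  // Stack temporaries for values that are passed to or returned from the
  // libcall by pointer: the CAS 'expected' slot, and the 'val'/'ret' slots
  // used by the generic (unsized) variants.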
  AllocaInst *AllocaCASExpected = nullptr;
  AllocaInst *AllocaValue = nullptr;
  AllocaInst *AllocaResult = nullptr;

  Type *ResultTy;
  SmallVector<Value *, 6> Args;
  AttributeList Attr;

  // 'size' argument.
  if (!UseSizedLibcall) {
    // Note, getIntPtrType is assumed equivalent to size_t.
    Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
  }

  // 'ptr' argument.
  // note: This assumes all address spaces share a common libfunc
  // implementation and that addresses are convertible. For systems without
  // that property, we'd need to extend this mechanism to support AS-specific
  // families of atomic intrinsics.
  Value *PtrVal = PointerOperand;
  PtrVal = Builder.CreateAddrSpaceCast(PtrVal, PointerType::getUnqual(Ctx));
  Args.push_back(PtrVal);

  // 'expected' argument, if present.
  if (CASExpected) {
    AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
    AllocaCASExpected->setAlignment(AllocaAlignment);
    Builder.CreateLifetimeStart(AllocaCASExpected, SizeVal64);
    Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
    Args.push_back(AllocaCASExpected);
  }

  // 'val' argument ('desired' for cas), if present.
  if (ValueOperand) {
    if (UseSizedLibcall) {
      Value *IntValue =
          Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
      Args.push_back(IntValue);
    } else {
      AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
      AllocaValue->setAlignment(AllocaAlignment);
      Builder.CreateLifetimeStart(AllocaValue, SizeVal64);
      Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
      Args.push_back(AllocaValue);
    }
  }

  // 'ret' argument.
  if (!CASExpected && HasResult && !UseSizedLibcall) {
    AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
    AllocaResult->setAlignment(AllocaAlignment);
    Builder.CreateLifetimeStart(AllocaResult, SizeVal64);
    Args.push_back(AllocaResult);
  }

  // 'ordering' ('success_order' for cas) argument.
  Args.push_back(OrderingVal);

  // 'failure_order' argument, if present.
  if (Ordering2Val)
    Args.push_back(Ordering2Val);

  // Now, the return type.
  if (CASExpected) {
    ResultTy = Type::getInt1Ty(Ctx);
    Attr = Attr.addRetAttribute(Ctx, Attribute::ZExt);
  } else if (HasResult && UseSizedLibcall)
    ResultTy = SizedIntTy;
  else
    ResultTy = Type::getVoidTy(Ctx);

  // Done with setting up arguments and return types, create the call:
  SmallVector<Type *, 6> ArgTys;
  for (Value *Arg : Args)
    ArgTys.push_back(Arg->getType());
  FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
  FunctionCallee LibcallFn =
      M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
  CallInst *Call = Builder.CreateCall(LibcallFn, Args);
  Call->setAttributes(Attr);
  Value *Result = Call;
  // And then, extract the results...
  if (ValueOperand && !UseSizedLibcall)
    Builder.CreateLifetimeEnd(AllocaValue, SizeVal64);

  if (CASExpected) {
    // The final result from the CAS is {load of 'expected' alloca, bool result
    // from call}
    Type *FinalResultTy = I->getType();
    Value *V = PoisonValue::get(FinalResultTy);
    Value *ExpectedOut = Builder.CreateAlignedLoad(
        CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
    Builder.CreateLifetimeEnd(AllocaCASExpected, SizeVal64);
    V = Builder.CreateInsertValue(V, ExpectedOut, 0);
    V = Builder.CreateInsertValue(V, Result, 1);
    I->replaceAllUsesWith(V);
  } else if (HasResult) {
    Value *V;
    if (UseSizedLibcall)
      V = Builder.CreateBitOrPointerCast(Result, I->getType());
    else {
      V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
                                    AllocaAlignment);
      Builder.CreateLifetimeEnd(AllocaResult, SizeVal64);
    }
    I->replaceAllUsesWith(V);
  }
  I->eraseFromParent();
  return true;
}