Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
//===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
  if (isKnownNeverInfOrNaN(Op0, /*Depth=*/0, SQ) &&
      isKnownNeverInfOrNaN(Op1, /*Depth=*/0, SQ)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

/// Match an fpext from half to float, or a constant we can convert.
static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
  if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
    return FPExtSrc->getType()->isHalfTy();

  ConstantFP *CFP;
  if (match(Arg, m_ConstantFP(CFP))) {
    bool LosesInfo;
    APFloat Val(CFP->getValueAPF());
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (LosesInfo)
      return false;

    FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
    return true;
  }

  return false;
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with known elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
                                       Instruction *I) {
  auto *VTy = cast<FixedVectorType>(UseV->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);

  for (int i = VWidth - 1; i > 0; --i) {
    auto *Elt = findScalarElement(UseV, i);
    if (!Elt)
      break;

    if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
      if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
        break;
    } else {
      break;
    }

    DemandedElts.clearBit(i);
  }

  return DemandedElts;
}

// Trim elements of the end of the vector \p V, if they are
// equal to the first element of the vector.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  Value *FirstComponent = findScalarElement(V, 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
    SVI->getShuffleMask(ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      auto *Elt = findScalarElement(V, I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(I);
  }

  return DemandedElts;
}

static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);

/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
  return (SqrtOp->getType()->isFloatTy() &&
          (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
         SqrtOp->getType()->isHalfTy();
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
    if (!FMF.allowContract())
      break;
    auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
    if (!SrcCI)
      break;

    auto IID = SrcCI->getIntrinsicID();
    // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
    //
    // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
    // relaxed.
    if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
      const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
      FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
      if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
        break;

      if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
        break;

      Function *NewDecl = Intrinsic::getDeclaration(
          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});

      InnerFMF |= FMF;
      II.setFastMathFlags(InnerFMF);

      II.setCalledFunction(NewDecl);
      return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // f16 amdgcn.sqrt is identical to regular sqrt.
    if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
      Function *NewDecl = Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::sqrt, {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2: {
    const bool IsLog = IID == Intrinsic::amdgcn_log;
    const bool IsExp = IID == Intrinsic::amdgcn_exp2;
    Value *Src = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    if (IC.getSimplifyQuery().isUndefValue(Src))
      return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

    if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      if (C->isInfinity()) {
        // exp2(+inf) -> +inf
        // log2(+inf) -> +inf
        if (!C->isNegative())
          return IC.replaceInstUsesWith(II, C);

        // exp2(-inf) -> 0
        if (IsExp && C->isNegative())
          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
      }

      if (II.isStrictFP())
        break;

      if (C->isNaN()) {
        Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
        return IC.replaceInstUsesWith(II, Quieted);
      }

      // f32 instruction doesn't handle denormals, f16 does.
      if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
        Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
                                      : ConstantFP::get(Ty, 1.0);
        return IC.replaceInstUsesWith(II, FoldedValue);
      }

      if (IsLog && C->isNegative())
        return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

      // TODO: Full constant folding matching hardware behavior.
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (CMask) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::is_fpclass, Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(1, ConstantInt::get(Src1->getType(),
                                           CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(Src1))
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          Src1, ConstantInt::getNullValue(Src1->getType()));
      return IC.replaceInstUsesWith(II, CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    Value *V = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      V = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      V = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      V = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (V) {
      if (auto *CI = dyn_cast<CallInst>(V)) {
        CI->copyFastMathFlags(&II);
        CI->takeName(&II);
      }
      return IC.replaceInstUsesWith(II, V);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    if (!ST->hasMed3_16())
      break;

    Value *X, *Y, *Z;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
        matchFPExtFromF16(Src2, Z)) {
      Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
                                                  {X, Y, Z}, &II, II.getName());
      return new FPExtInst(NewCall, II.getType());
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, IC.Builder.CreateSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_mbcnt_hi: {
    // exec_hi is all 0, so this is just a copy.
    if (ST->isWave32())
      return IC.replaceInstUsesWith(II, II.getArgOperand(1));
    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }
    }
    if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
      // %b64 = call i64 ballot.i64(...)
      // =>
      // %b32 = call i32 ballot.i32(...)
      // %b64 = zext i32 %b32 to i64
      Value *Call = IC.Builder.CreateZExt(
          IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
                                     {IC.Builder.getInt32Ty()},
                                     {II.getArgOperand(0)}),
          II.getType());
      Call->takeName(&II);
      return IC.replaceInstUsesWith(II, Call);
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlane16_var:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlanex16_var: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    // FetchInvalid operand idx.
    unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
                          IID == Intrinsic::amdgcn_permlanex16)
                             ? 4 /* for permlane16 and permlanex16 */
                             : 3; /* for permlane16_var and permlanex16_var */

    // BoundCtrl operand idx.
    // For permlane16 and permlanex16 it should be 5
    // For Permlane16_var and permlanex16_var it should be 4
    unsigned int BcIdx = FiIdx + 1;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_trig_preop: {
    // The intrinsic is declared with name mangling, but currently the
    // instruction only exists for f64
    if (!II.getType()->isDoubleTy())
      break;

    Value *Src = II.getArgOperand(0);
    Value *Segment = II.getArgOperand(1);
    if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    if (isa<UndefValue>(Src)) {
      auto *QNaN = ConstantFP::get(
          II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
    if (!Csrc)
      break;

    if (II.isStrictFP())
      break;

    const APFloat &Fsrc = Csrc->getValueAPF();
    if (Fsrc.isNaN()) {
      auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet());
      return IC.replaceInstUsesWith(II, Quieted);
    }

    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
    if (!Cseg)
      break;

    unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
    unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
    unsigned Shift = SegmentVal * 53;
    if (Exponent > 1077)
      Shift += Exponent - 1077;

    // 2.0/PI table.
    static const uint32_t TwoByPi[] = {
        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
        0x56033046};

    // Return 0 for outbound segment (hardware behavior).
    unsigned Idx = Shift >> 5;
    if (Idx + 2 >= std::size(TwoByPi)) {
      APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
    }

    unsigned BShift = Shift & 0x1f;
    uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
    uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
    if (BShift)
      Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
    Thi = Thi >> 11;
    APFloat Result = APFloat((double)Thi);

    int Scale = -53 - Shift;
    if (Exponent >= 1968)
      Scale += 128;

    Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getZero(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_image_store_1d:
  case Intrinsic::amdgcn_image_store_1darray:
  case Intrinsic::amdgcn_image_store_2d:
  case Intrinsic::amdgcn_image_store_2darray:
  case Intrinsic::amdgcn_image_store_2darraymsaa:
  case Intrinsic::amdgcn_image_store_2dmsaa:
  case Intrinsic::amdgcn_image_store_3d:
  case Intrinsic::amdgcn_image_store_cube:
  case Intrinsic::amdgcn_image_store_mip_1d:
  case Intrinsic::amdgcn_image_store_mip_1darray:
  case Intrinsic::amdgcn_image_store_mip_2d:
  case Intrinsic::amdgcn_image_store_mip_2darray:
  case Intrinsic::amdgcn_image_store_mip_3d:
  case Intrinsic::amdgcn_image_store_mip_cube: {
    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
      break;

    APInt DemandedElts;
    if (ST->hasDefaultComponentBroadcast())
      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
    else if (ST->hasDefaultComponentZero())
      DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
    else
      break;

    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
                                              false)) {
      return IC.eraseInstFromFunction(II);
    }

    break;
  }
  }
  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// The result of simplifying amdgcn image and buffer store intrinsics is updating
/// definitions of the intrinsics vector argument, not Uses of the result like
/// image and buffer loads.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx, bool IsLoad) {

  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
                                             : II.getOperand(0)->getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
      case Intrinsic::amdgcn_raw_ptr_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
      case Intrinsic::amdgcn_struct_ptr_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // dmask 0 has special semantics, do not simplify.
    if (DMaskVal == 0)
      return nullptr;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLdStIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLdStIdx])
          NewDMaskVal |= Bit;
        OrigLdStIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.popcount();
  if (!NewNumElts)
    return PoisonValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  if (!IsLoad) {
    SmallVector<int, 8> EltMask;
    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
      if (DemandedElts[OrigStoreIdx])
        EltMask.push_back(OrigStoreIdx);

    if (NewNumElts == 1)
      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
    else
      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
  }

  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (IsLoad) {
    if (NewNumElts == 1) {
      return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
                                            DemandedElts.countr_zero());
    }

    SmallVector<int, 8> EltMask;
    unsigned NewLoadIdx = 0;
    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
      if (!!DemandedElts[OrigLoadIdx])
        EltMask.push_back(NewLoadIdx++);
      else
        EltMask.push_back(NewNumElts);
    }

    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

    return Shuffle;
  }

  return NewCall;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}