Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
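
// Illustrative example (assumed MIR, not from this file): for G_FADD the
// negate distributes over both operands,
//   %b:_(s32) = G_FNEG (G_FADD %x, %y)
//     -->
//   %b:_(s32) = G_FADD (G_FNEG %x), (G_FNEG %y)
// and each inner G_FNEG can then be folded away as a VOP3 source modifier.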

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // a source modifier is truly free for them. For users that would not
  // otherwise need a VOP3 encoding, adding a source modifier forces the larger
  // encoding and may increase code size. Try to avoid increasing code size
  // unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

// Bit patterns of 1.0 / (2.0 * pi) in half, single, and double precision.
static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// Negating +0.0 or +1.0 / (2.0 * pi) yields a constant (-0.0 or the negated
// inv2pi value) that has no inline immediate, so there is an additional cost
// to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}
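
// Swapping a min/max opcode for its dual is what makes fneg foldable through
// min/max, via the identity
//   fneg(fmin(x, y)) = fmax(fneg(x), fneg(y))
// and symmetrically for fmax.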
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot take source modifiers, give up. This both prevents
  // unprofitable transformations and infinite loops: we won't repeatedly try
  // to fold around a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
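
// Concrete single-use example (assumed MIR): matchFoldableFneg on
//   %a:_(s32) = G_FMUL %x, %y
//   %b:_(s32) = G_FNEG %a
// lets applyFoldableFneg below rewrite the pair into
//   %a:_(s32) = G_FMUL %x, (G_FNEG %y)
// with all uses of %b replaced by the now-negated %a.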

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace the register in Op with a register holding the negated value; an
  // operand that is already an fneg cancels instead of stacking a second one.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Negate one of the two operands, preferring one that is already an fneg so
  // the two negations cancel.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate the appropriate operands so that the resulting value of MatchInfo
  // is negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }
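
  // E.g. (assumed MIR) in the multi-use case, with a second use %u of %a:
  //   %a:_(s32) = G_FADD %x, %y    ; MatchInfo
  //   %b:_(s32) = G_FNEG %a        ; MI
  //   %u:_(s32) = G_FMUL %a, %z
  // becomes
  //   %neg:_(s32) = G_FADD (G_FNEG %x), (G_FNEG %y)
  //   %a:_(s32)   = G_FNEG %neg    ; recreated for %u
  // with all uses of %b rewired to %neg.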
  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice
    // versa, but replaceRegWith will replace defs as well. It is easier to
    // replace one def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has the negated value, so use that instead of the old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  // Expand the median as med3(x, y, z) = min(max(x, y), max(min(x, y), z)).
  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}