Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}
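// For example, both of these forms expose the high half of the same 32-bit
// source V, and reduce to it:
//   (extract_vector_elt (v2i16 (bitcast i32:V)), 1)
//   (trunc (srl i32:V, 16))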
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
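// In practice, fp16SrcZerosHighBits answering true means that on gfx9 and
// earlier an op such as v_add_f16 writes zeros to bits [31:16] of its 32-bit
// destination VGPR, so a later pack of the low half needs no extra masking.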
bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}
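// On GCN the inline constants checked above are the integers -16..64 and a
// small set of floating-point values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0,
// plus 1/(2*pi) on newer targets); anything else must be encoded as a
// literal operand.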
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
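// Schematically, buildSMovImm64 emits:
//   %lo  = S_MOV_B32 (Imm & 0xffffffff)
//   %hi  = S_MOV_B32 (Imm >> 32)
//   %res = REG_SEQUENCE %lo, sub0, %hi, sub1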
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
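// For instance, (build_vector a, b, c, d) destined for a 128-bit register
// class becomes: REG_SEQUENCE RC, a, sub0, b, sub1, c, sub2, d, sub3.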
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }
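  // 64-bit constants that are neither inline immediates nor encodable as a
  // single 32-bit literal are materialized below as two S_MOV_B32s joined by
  // a REG_SEQUENCE.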
  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
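// For example, isUnneededShiftMask returns true for (and x, 31) when
// ShAmtBits == 5 (a 32-bit shift): the mask only keeps bits the shift reads
// anyway, so the AND can be folded away.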
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split the 64-bit `or` earlier, it's a complicated pattern to
    // match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
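  // Opcode lookup, indexed as OpcMap[ConsumesCarryIn][IsDivergent][IsAdd].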
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}
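// In SelectAddcSubb above and SelectUADDO_USUBO below, it is the node's
// divergence, not its IR-level type, that decides whether the carry is kept
// in VCC via V_ADDC/V_SUBB or goes through the scalar carry pseudos that are
// expanded later.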
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
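// SMUL_LOHI / UMUL_LOHI reuse the same MAD opcodes with a zero addend; the
// 64-bit product is then split back into its 32-bit halves with
// EXTRACT_SUBREG.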
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
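// DS instructions take a 16-bit unsigned byte offset, so e.g. offset:65535
// can be folded here while 65536 has to stay in the address computation.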
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
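// For ds_read2_b32 / ds_write2_b32 (Size == 4), isDSOffset2Legal admits byte
// offsets that are multiples of 4 up to 255 * 4 = 1020; the b64 forms
// (Size == 8) allow multiples of 8 up to 2040.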
// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for a flat scratch
// access in the form: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for a flat scratch
// access in the form: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
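      // e.g. with a 12-bit offset field (MaxOffset == 4095), a constant
      // private address of 0x11234 splits into a v_mov_b32 of 0x11000 plus
      // an immediate offset of 0x234.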
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}
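// On subtargets with a restricted SOffset field, a zero byte offset has to be
// encoded as the SGPR_NULL register rather than a literal 0, which is what
// the helper below takes care of.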
bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field
        // and add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
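        // A 32-bit (scratch) address needs a single V_ADD for the remainder;
        // a 64-bit address takes an add/addc pair whose halves are stitched
        // back together with REG_SEQUENCE below.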
        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base
          // address is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

        if (isUInt<32>(RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If the constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it takes fewer
      // instructions to perform VALU adds with immediates or inline literals.
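      //
      // For example (illustrative): if neither 32-bit half of COffsetVal is
      // an inline constant, NumLiterals below is 2; on a subtarget whose
      // constant bus limit exceeds that, we return false and let the plain
      // VALU adds with literals be selected instead.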
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
          !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}

static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for the scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  return SAddr;
}

// Match (32-bit SGPR base) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
                              SIInstrFlags::FlatScratch)) {
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;
    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32);

  return true;
}

// Check whether the flat scratch SVS swizzle bug affects this access.
bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
    SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
    return false;

  // The bug affects the swizzling of SVS accesses if there is any carry out
  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
  // voffset to (soffset + inst_offset).
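  // For example, if the known bits allow (VMax & 3) == 3 and (SMax & 3) == 2,
  // then 3 + 2 >= 4 below, meaning a carry from bit 1 into bit 2 is possible
  // and the access is conservatively treated as affected.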
  KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
  KnownBits SKnown = KnownBits::computeForAddSub(
      /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
      CurDAG->computeKnownBits(SAddr),
      KnownBits::makeConstant(APInt(32, ImmOffset)));
  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
  return (VMax & 3) + (SMax & 3) >= 4;
}

bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset) const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(SplitImmOffset, RemainderOffset) =
          TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);

      if (isUInt<32>(RemainderOffset)) {
        SDNode *VMov = CurDAG->getMachineNode(
            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
        return true;
      }
    }
  }

  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}

// For unbuffered smem loads, it is illegal for the Immediate Offset to be
// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
// Handle the case where the Immediate Offset + SOffset is negative.
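// For example (illustrative): ImmOffset = -8 with an SOffset whose known
// minimum value is 4 gives -8 + 4 < 0, so the combination is rejected.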
bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
                                                     bool Imm32Only,
                                                     bool IsBuffer,
                                                     int64_t ImmOffset) const {
  if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
      AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
    KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
    if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
      return false;
  }

  return true;
}

// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset,
                                          int64_t ImmOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  if (SOffset) {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}

SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
      Addr,
      CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
              0),
      CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}

// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
                                              SDValue *SOffset, SDValue *Offset,
                                              bool Imm32Only, bool IsBuffer,
                                              bool HasSOffset,
                                              int64_t ImmOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
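    // Match an immediate offset on Addr first; the base that remains is then
    // matched against the SGPR offset.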
    SDValue B;

    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
                                ImmOff);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue *SOffset, SDValue *Offset,
                                    bool Imm32Only) const {
  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
    SBase = Expand32BitAddress(SBase);
    return true;
  }

  if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
    SBase = Expand32BitAddress(Addr);
    *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
                    /* Imm32Only */ true);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &SOffset) const {
  return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
                                           SDValue &SOffset,
                                           SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, &SOffset, &Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ false, /* IsBuffer */ true);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                                 SDValue &Offset) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  return N.getValueType() == MVT::i32 &&
         SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset */ nullptr,
                              &Offset, /* Imm32Only */ false,
                              /* IsBuffer */ true);
}

bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
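    // e.g. for (add n0, 16): unless n0's sign bit is known to be zero, n0
    // itself might be negative, so the whole add is kept as the base instead.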
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  if (Val->isDivergent()) {
    unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
    SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);

    return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
  }
  unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
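  // e.g. Offset = 16 and Width = 8 pack to 16 | (8 << 16) = 0x80010.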
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}

void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)"
  // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();
  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
  }

  return false;
}
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
  // Special case for amdgcn.ballot:
  // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
  // =>
  // Use i1 %Cond value instead of i(WaveSize) %VCMP.
  // This is possible because divergent ISD::SETCC is selected as V_CMP and
  // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use the SETEQ condition, but it's easy to support
  // it here for completeness, so in this case Negate is set true on return.
  auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
      isNullConstant(VCMP.getOperand(1))) {

    auto Cond = VCMP.getOperand(0);
    if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
      Cond = Cond.getOperand(0);

    if (isBoolSGPR(Cond)) {
      Negate = VCMP_CC == ISD::SETEQ;
      return Cond;
    }
  }
  return SDValue();
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination
    // of V_CMPs resulting from a ballot, or the ballot had a uniform
    // condition and SCC is used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND.
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND
    // removal, so it catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                               : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
  if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
      !N->isDivergent()) {
    SDValue Src = N->getOperand(0);
    if (Src.getValueType() == MVT::f16) {
      if (isExtractHiElt(Src, Src)) {
        CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
                             {Src});
        return;
      }
    }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
  unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
  SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
                   N->getOperand(5), N->getOperand(0)};

  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will take
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16-bits, we could leave it as-is and add 1
    // to the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1)   // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    SelectDSBvhStackIntrinsic(N);
    return;
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
  SDValue Log2WaveSize =
    CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N),
                              MVT::i32);
  CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
                       {N->getOperand(0), Log2WaveSize});
}

void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  Register SP = TLI->getStackPointerRegisterToSaveRestore();
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    CopyVal = SrcVal.getOperand(0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
        Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);

    if (N->isDivergent()) {
      SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
                                              MVT::i32, SrcVal),
                       0);
    }

    CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                                             {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
}
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods,
                                            bool IsCanonicalizing,
                                            bool AllowAbs) const {
  Mods = SISrcMods::NONE;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = Src.getOperand(1);
    }
  }

  if (AllowAbs && Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/true)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
    SDValue In, SDValue &Src, SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
                         /*AllowAbs=*/true)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods,
                         /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
                                               SDValue &SrcMods,
                                               bool OpSel) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods,
                         /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false)) {
    if (OpSel)
      Mods |= SISrcMods::OP_SEL_0;
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
                                           SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
}

bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods, SDValue &Clamp,
                                          SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3BMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, bool IsDOT) const {
  unsigned Mods = SISrcMods::NONE;
  Src = In;

  // TODO: Handle G_FSUB 0 as fneg
  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
      (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
          MVT::getIntegerVT(VecSize), Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
          MVT::getIntegerVT(VecSize), Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        SDLoc SL(In);
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
                                   Lo.getValueType()), 0);
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(RC, SL, MVT::i32),
          Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
          Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                             Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
                         .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
        return true;
      }
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  return SelectVOP3PMods(In, Src, SrcMods, true);
}

bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
  const ConstantSDNode *C = cast<ConstantSDNode>(In);
  // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
  // 1 promotes packed values to signed, 0 treats them as unsigned.
  assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");

  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned SrcSign = C->getZExtValue();
  if (SrcSign == 1)
    Mods ^= SISrcMods::NEG;

  Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
                                                  SDValue &Src) const {
  const ConstantSDNode *C = cast<ConstantSDNode>(In);
  assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");

  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned SrcVal = C->getZExtValue();
  if (SrcVal == 1)
    Mods |= SISrcMods::OP_SEL_0;

  Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
                                         llvm::SelectionDAG *CurDAG,
                                         const SDLoc &DL) {
  unsigned DstRegClass;
  EVT DstTy;
  switch (Elts.size()) {
  case 8:
    DstRegClass = AMDGPU::VReg_256RegClassID;
    DstTy = MVT::v8i32;
    break;
  case 4:
    DstRegClass = AMDGPU::VReg_128RegClassID;
    DstTy = MVT::v4i32;
    break;
  case 2:
    DstRegClass = AMDGPU::VReg_64RegClassID;
    DstTy = MVT::v2i32;
    break;
  default:
    llvm_unreachable("unhandled Reg sequence size");
  }

  SmallVector<SDValue, 17> Ops;
  Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
  for (unsigned i = 0; i < Elts.size(); ++i) {
    Ops.push_back(Elts[i]);
    Ops.push_back(CurDAG->getTargetConstant(
        SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
  }
  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
}

static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
                                         llvm::SelectionDAG *CurDAG,
                                         const SDLoc &DL) {
  SmallVector<SDValue, 8> PackedElts;
  assert("unhandled Reg sequence size" &&
         (Elts.size() == 8 || Elts.size() == 16));

  // Pack 16-bit elements in pairs into 32-bit register. If both elements are
  // unpacked from 32-bit source use it, otherwise pack them using v_perm.
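  // (With the byte selector 0x05040100 below, V_PERM_B32 places the low
  // 16 bits of Elts[i] in the low half of the result and the low 16 bits of
  // Elts[i + 1] in the high half, so the pair lands in one 32-bit register.)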
  for (unsigned i = 0; i < Elts.size(); i += 2) {
    SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
    SDValue HiSrc;
    if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
      PackedElts.push_back(HiSrc);
    } else {
      SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
      MachineSDNode *Packed =
          CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
                                 {Elts[i + 1], Elts[i], PackLoLo});
      PackedElts.push_back(SDValue(Packed, 0));
    }
  }

  return buildRegSequence32(PackedElts, CurDAG, DL);
}

static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
                                       llvm::SelectionDAG *CurDAG,
                                       const SDLoc &DL, unsigned ElementSize) {
  if (ElementSize == 16)
    return buildRegSequence16(Elts, CurDAG, DL);
  if (ElementSize == 32)
    return buildRegSequence32(Elts, CurDAG, DL);
  llvm_unreachable("Unhandled element size");
}

static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
                                 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
                                 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
                                 unsigned ElementSize) {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have abs modifier
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(El->getOperand(0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
  }
}
// Check all f16 elements for modifiers while looking through b32 and v2b16
// build vectors; stop if an element does not satisfy ModifierCheck.
static void
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
                              std::function<bool(SDValue)> ModifierCheck) {
  for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
    if (auto *F16Pair =
            dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
      for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
        SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
        if (!ModifierCheck(ElF16))
          break;
      }
    }
  }
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Element.getOperand(0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on first element decide which mod we match, neg or abs
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(ElV2f16.getOperand(0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
      // Based on first element decide which mod we match, neg or abs
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(ElF16.getOperand(0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
                           16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2f16->getOperand(0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    assert(BV->getNumOperands() > 0);
    // Based on first element decide which mod we match, neg or abs
    SDValue ElF32 = stripBitcast(BV->getOperand(0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(BV->getOperand(i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(ElF32.getOperand(0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(FloatVal)) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(RawValue.value())) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
                                            SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

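  // A shift of the 32-bit source by a byte multiple selects that byte as the
  // index key, e.g. (srl x, 16) yields Key = 2.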
  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() % 8 == 0) {
      Key = ShiftAmt->getZExtValue() / 8;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() == 16) {
      Key = 1;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and the source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                  SDValue &SrcMods) const {
  unsigned Mods = 0;
  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
    return false;
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
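
// Worked example (illustrative, not part of the original source): for a
// mad_mix-style operand such as
//
//   f32 = fp_extend (f16 = extract_vector_elt (v2f16 %v), 1)
//
// SelectVOP3PMadMixModsImpl strips the fp_extend, recognizes the high-half
// extract via isExtractHiElt, and returns Src = %v with
// OP_SEL_1 | OP_SEL_0 set: op_sel_hi requests the f16-to-f32 conversion and
// op_sel picks the high 16 bits of the 32-bit source register, exactly as
// the comment inside the function describes.
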
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" at this point, we have not succeeded
      // in commuting the current user. This means at least one use strictly
      // requires a VGPR, so we will not attempt to commute other user
      // instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  const MachineMemOperand *MMO = Ld->getMemOperand();
  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
    return false;

  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more.
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}

char AMDGPUDAGToDAGISelLegacy::ID = 0;
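
// Note (illustrative, not part of the original source): PostprocessISelDAG
// above iterates to a fixed point on purpose. A successful PostISelFolding
// replacement can presumably expose further folds in the users of the
// replaced node, so a single sweep over the node list is not guaranteed to
// be sufficient; RemoveDeadNodes() clears out whatever each sweep orphaned
// before the next iteration re-scans the DAG.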