Path: blob/main/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
35271 views
//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file defines an instruction selector for the NVPTX target.9//10//===----------------------------------------------------------------------===//1112#include "NVPTXISelDAGToDAG.h"13#include "MCTargetDesc/NVPTXBaseInfo.h"14#include "NVPTXUtilities.h"15#include "llvm/Analysis/ValueTracking.h"16#include "llvm/CodeGen/ISDOpcodes.h"17#include "llvm/IR/GlobalValue.h"18#include "llvm/IR/Instructions.h"19#include "llvm/IR/IntrinsicsNVPTX.h"20#include "llvm/Support/AtomicOrdering.h"21#include "llvm/Support/CommandLine.h"22#include "llvm/Support/Debug.h"23#include "llvm/Support/ErrorHandling.h"24#include "llvm/Support/raw_ostream.h"25#include "llvm/Target/TargetIntrinsicInfo.h"2627using namespace llvm;2829#define DEBUG_TYPE "nvptx-isel"30#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"3132static cl::opt<bool>33EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,34cl::desc("Enable reciprocal sqrt optimization"));3536/// createNVPTXISelDag - This pass converts a legalized DAG into a37/// NVPTX-specific DAG, ready for instruction scheduling.38FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,39llvm::CodeGenOptLevel OptLevel) {40return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);41}4243NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,44CodeGenOptLevel OptLevel)45: SelectionDAGISelLegacy(46ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}4748char NVPTXDAGToDAGISelLegacy::ID = 0;4950INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)5152NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,53CodeGenOptLevel OptLevel)54: SelectionDAGISel(tm, OptLevel), TM(tm) {55doMulWide = (OptLevel > CodeGenOptLevel::None);56}5758bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {59Subtarget = &MF.getSubtarget<NVPTXSubtarget>();60return SelectionDAGISel::runOnMachineFunction(MF);61}6263int NVPTXDAGToDAGISel::getDivF32Level() const {64return Subtarget->getTargetLowering()->getDivF32Level();65}6667bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {68return Subtarget->getTargetLowering()->usePrecSqrtF32();69}7071bool NVPTXDAGToDAGISel::useF32FTZ() const {72return Subtarget->getTargetLowering()->useF32FTZ(*MF);73}7475bool NVPTXDAGToDAGISel::allowFMA() const {76const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();77return TL->allowFMA(*MF, OptLevel);78}7980bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {81const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();82return TL->allowUnsafeFPMath(*MF);83}8485bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }8687/// Select - Select instructions not customized! Used for88/// expanded, promoted and normal instructions.89void NVPTXDAGToDAGISel::Select(SDNode *N) {9091if (N->isMachineOpcode()) {92N->setNodeId(-1);93return; // Already selected.94}9596switch (N->getOpcode()) {97case ISD::LOAD:98case ISD::ATOMIC_LOAD:99if (tryLoad(N))100return;101break;102case ISD::STORE:103case ISD::ATOMIC_STORE:104if (tryStore(N))105return;106break;107case ISD::EXTRACT_VECTOR_ELT:108if (tryEXTRACT_VECTOR_ELEMENT(N))109return;110break;111case NVPTXISD::SETP_F16X2:112SelectSETP_F16X2(N);113return;114case NVPTXISD::SETP_BF16X2:115SelectSETP_BF16X2(N);116return;117case NVPTXISD::LoadV2:118case NVPTXISD::LoadV4:119if (tryLoadVector(N))120return;121break;122case NVPTXISD::LDGV2:123case NVPTXISD::LDGV4:124case NVPTXISD::LDUV2:125case NVPTXISD::LDUV4:126if (tryLDGLDU(N))127return;128break;129case NVPTXISD::StoreV2:130case NVPTXISD::StoreV4:131if (tryStoreVector(N))132return;133break;134case NVPTXISD::LoadParam:135case NVPTXISD::LoadParamV2:136case NVPTXISD::LoadParamV4:137if (tryLoadParam(N))138return;139break;140case NVPTXISD::StoreRetval:141case NVPTXISD::StoreRetvalV2:142case NVPTXISD::StoreRetvalV4:143if (tryStoreRetval(N))144return;145break;146case NVPTXISD::StoreParam:147case NVPTXISD::StoreParamV2:148case NVPTXISD::StoreParamV4:149case NVPTXISD::StoreParamS32:150case NVPTXISD::StoreParamU32:151if (tryStoreParam(N))152return;153break;154case ISD::INTRINSIC_WO_CHAIN:155if (tryIntrinsicNoChain(N))156return;157break;158case ISD::INTRINSIC_W_CHAIN:159if (tryIntrinsicChain(N))160return;161break;162case NVPTXISD::Tex1DFloatS32:163case NVPTXISD::Tex1DFloatFloat:164case NVPTXISD::Tex1DFloatFloatLevel:165case NVPTXISD::Tex1DFloatFloatGrad:166case NVPTXISD::Tex1DS32S32:167case NVPTXISD::Tex1DS32Float:168case NVPTXISD::Tex1DS32FloatLevel:169case NVPTXISD::Tex1DS32FloatGrad:170case NVPTXISD::Tex1DU32S32:171case NVPTXISD::Tex1DU32Float:172case NVPTXISD::Tex1DU32FloatLevel:173case NVPTXISD::Tex1DU32FloatGrad:174case NVPTXISD::Tex1DArrayFloatS32:175case NVPTXISD::Tex1DArrayFloatFloat:176case NVPTXISD::Tex1DArrayFloatFloatLevel:177case NVPTXISD::Tex1DArrayFloatFloatGrad:178case NVPTXISD::Tex1DArrayS32S32:179case NVPTXISD::Tex1DArrayS32Float:180case NVPTXISD::Tex1DArrayS32FloatLevel:181case NVPTXISD::Tex1DArrayS32FloatGrad:182case NVPTXISD::Tex1DArrayU32S32:183case NVPTXISD::Tex1DArrayU32Float:184case NVPTXISD::Tex1DArrayU32FloatLevel:185case NVPTXISD::Tex1DArrayU32FloatGrad:186case NVPTXISD::Tex2DFloatS32:187case NVPTXISD::Tex2DFloatFloat:188case NVPTXISD::Tex2DFloatFloatLevel:189case NVPTXISD::Tex2DFloatFloatGrad:190case NVPTXISD::Tex2DS32S32:191case NVPTXISD::Tex2DS32Float:192case NVPTXISD::Tex2DS32FloatLevel:193case NVPTXISD::Tex2DS32FloatGrad:194case NVPTXISD::Tex2DU32S32:195case NVPTXISD::Tex2DU32Float:196case NVPTXISD::Tex2DU32FloatLevel:197case NVPTXISD::Tex2DU32FloatGrad:198case NVPTXISD::Tex2DArrayFloatS32:199case NVPTXISD::Tex2DArrayFloatFloat:200case NVPTXISD::Tex2DArrayFloatFloatLevel:201case NVPTXISD::Tex2DArrayFloatFloatGrad:202case NVPTXISD::Tex2DArrayS32S32:203case NVPTXISD::Tex2DArrayS32Float:204case NVPTXISD::Tex2DArrayS32FloatLevel:205case NVPTXISD::Tex2DArrayS32FloatGrad:206case NVPTXISD::Tex2DArrayU32S32:207case NVPTXISD::Tex2DArrayU32Float:208case NVPTXISD::Tex2DArrayU32FloatLevel:209case NVPTXISD::Tex2DArrayU32FloatGrad:210case NVPTXISD::Tex3DFloatS32:211case NVPTXISD::Tex3DFloatFloat:212case NVPTXISD::Tex3DFloatFloatLevel:213case NVPTXISD::Tex3DFloatFloatGrad:214case NVPTXISD::Tex3DS32S32:215case NVPTXISD::Tex3DS32Float:216case NVPTXISD::Tex3DS32FloatLevel:217case NVPTXISD::Tex3DS32FloatGrad:218case NVPTXISD::Tex3DU32S32:219case NVPTXISD::Tex3DU32Float:220case NVPTXISD::Tex3DU32FloatLevel:221case NVPTXISD::Tex3DU32FloatGrad:222case NVPTXISD::TexCubeFloatFloat:223case NVPTXISD::TexCubeFloatFloatLevel:224case NVPTXISD::TexCubeS32Float:225case NVPTXISD::TexCubeS32FloatLevel:226case NVPTXISD::TexCubeU32Float:227case NVPTXISD::TexCubeU32FloatLevel:228case NVPTXISD::TexCubeArrayFloatFloat:229case NVPTXISD::TexCubeArrayFloatFloatLevel:230case NVPTXISD::TexCubeArrayS32Float:231case NVPTXISD::TexCubeArrayS32FloatLevel:232case NVPTXISD::TexCubeArrayU32Float:233case NVPTXISD::TexCubeArrayU32FloatLevel:234case NVPTXISD::Tld4R2DFloatFloat:235case NVPTXISD::Tld4G2DFloatFloat:236case NVPTXISD::Tld4B2DFloatFloat:237case NVPTXISD::Tld4A2DFloatFloat:238case NVPTXISD::Tld4R2DS64Float:239case NVPTXISD::Tld4G2DS64Float:240case NVPTXISD::Tld4B2DS64Float:241case NVPTXISD::Tld4A2DS64Float:242case NVPTXISD::Tld4R2DU64Float:243case NVPTXISD::Tld4G2DU64Float:244case NVPTXISD::Tld4B2DU64Float:245case NVPTXISD::Tld4A2DU64Float:246case NVPTXISD::TexUnified1DFloatS32:247case NVPTXISD::TexUnified1DFloatFloat:248case NVPTXISD::TexUnified1DFloatFloatLevel:249case NVPTXISD::TexUnified1DFloatFloatGrad:250case NVPTXISD::TexUnified1DS32S32:251case NVPTXISD::TexUnified1DS32Float:252case NVPTXISD::TexUnified1DS32FloatLevel:253case NVPTXISD::TexUnified1DS32FloatGrad:254case NVPTXISD::TexUnified1DU32S32:255case NVPTXISD::TexUnified1DU32Float:256case NVPTXISD::TexUnified1DU32FloatLevel:257case NVPTXISD::TexUnified1DU32FloatGrad:258case NVPTXISD::TexUnified1DArrayFloatS32:259case NVPTXISD::TexUnified1DArrayFloatFloat:260case NVPTXISD::TexUnified1DArrayFloatFloatLevel:261case NVPTXISD::TexUnified1DArrayFloatFloatGrad:262case NVPTXISD::TexUnified1DArrayS32S32:263case NVPTXISD::TexUnified1DArrayS32Float:264case NVPTXISD::TexUnified1DArrayS32FloatLevel:265case NVPTXISD::TexUnified1DArrayS32FloatGrad:266case NVPTXISD::TexUnified1DArrayU32S32:267case NVPTXISD::TexUnified1DArrayU32Float:268case NVPTXISD::TexUnified1DArrayU32FloatLevel:269case NVPTXISD::TexUnified1DArrayU32FloatGrad:270case NVPTXISD::TexUnified2DFloatS32:271case NVPTXISD::TexUnified2DFloatFloat:272case NVPTXISD::TexUnified2DFloatFloatLevel:273case NVPTXISD::TexUnified2DFloatFloatGrad:274case NVPTXISD::TexUnified2DS32S32:275case NVPTXISD::TexUnified2DS32Float:276case NVPTXISD::TexUnified2DS32FloatLevel:277case NVPTXISD::TexUnified2DS32FloatGrad:278case NVPTXISD::TexUnified2DU32S32:279case NVPTXISD::TexUnified2DU32Float:280case NVPTXISD::TexUnified2DU32FloatLevel:281case NVPTXISD::TexUnified2DU32FloatGrad:282case NVPTXISD::TexUnified2DArrayFloatS32:283case NVPTXISD::TexUnified2DArrayFloatFloat:284case NVPTXISD::TexUnified2DArrayFloatFloatLevel:285case NVPTXISD::TexUnified2DArrayFloatFloatGrad:286case NVPTXISD::TexUnified2DArrayS32S32:287case NVPTXISD::TexUnified2DArrayS32Float:288case NVPTXISD::TexUnified2DArrayS32FloatLevel:289case NVPTXISD::TexUnified2DArrayS32FloatGrad:290case NVPTXISD::TexUnified2DArrayU32S32:291case NVPTXISD::TexUnified2DArrayU32Float:292case NVPTXISD::TexUnified2DArrayU32FloatLevel:293case NVPTXISD::TexUnified2DArrayU32FloatGrad:294case NVPTXISD::TexUnified3DFloatS32:295case NVPTXISD::TexUnified3DFloatFloat:296case NVPTXISD::TexUnified3DFloatFloatLevel:297case NVPTXISD::TexUnified3DFloatFloatGrad:298case NVPTXISD::TexUnified3DS32S32:299case NVPTXISD::TexUnified3DS32Float:300case NVPTXISD::TexUnified3DS32FloatLevel:301case NVPTXISD::TexUnified3DS32FloatGrad:302case NVPTXISD::TexUnified3DU32S32:303case NVPTXISD::TexUnified3DU32Float:304case NVPTXISD::TexUnified3DU32FloatLevel:305case NVPTXISD::TexUnified3DU32FloatGrad:306case NVPTXISD::TexUnifiedCubeFloatFloat:307case NVPTXISD::TexUnifiedCubeFloatFloatLevel:308case NVPTXISD::TexUnifiedCubeS32Float:309case NVPTXISD::TexUnifiedCubeS32FloatLevel:310case NVPTXISD::TexUnifiedCubeU32Float:311case NVPTXISD::TexUnifiedCubeU32FloatLevel:312case NVPTXISD::TexUnifiedCubeArrayFloatFloat:313case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:314case NVPTXISD::TexUnifiedCubeArrayS32Float:315case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:316case NVPTXISD::TexUnifiedCubeArrayU32Float:317case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:318case NVPTXISD::TexUnifiedCubeFloatFloatGrad:319case NVPTXISD::TexUnifiedCubeS32FloatGrad:320case NVPTXISD::TexUnifiedCubeU32FloatGrad:321case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:322case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:323case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:324case NVPTXISD::Tld4UnifiedR2DFloatFloat:325case NVPTXISD::Tld4UnifiedG2DFloatFloat:326case NVPTXISD::Tld4UnifiedB2DFloatFloat:327case NVPTXISD::Tld4UnifiedA2DFloatFloat:328case NVPTXISD::Tld4UnifiedR2DS64Float:329case NVPTXISD::Tld4UnifiedG2DS64Float:330case NVPTXISD::Tld4UnifiedB2DS64Float:331case NVPTXISD::Tld4UnifiedA2DS64Float:332case NVPTXISD::Tld4UnifiedR2DU64Float:333case NVPTXISD::Tld4UnifiedG2DU64Float:334case NVPTXISD::Tld4UnifiedB2DU64Float:335case NVPTXISD::Tld4UnifiedA2DU64Float:336if (tryTextureIntrinsic(N))337return;338break;339case NVPTXISD::Suld1DI8Clamp:340case NVPTXISD::Suld1DI16Clamp:341case NVPTXISD::Suld1DI32Clamp:342case NVPTXISD::Suld1DI64Clamp:343case NVPTXISD::Suld1DV2I8Clamp:344case NVPTXISD::Suld1DV2I16Clamp:345case NVPTXISD::Suld1DV2I32Clamp:346case NVPTXISD::Suld1DV2I64Clamp:347case NVPTXISD::Suld1DV4I8Clamp:348case NVPTXISD::Suld1DV4I16Clamp:349case NVPTXISD::Suld1DV4I32Clamp:350case NVPTXISD::Suld1DArrayI8Clamp:351case NVPTXISD::Suld1DArrayI16Clamp:352case NVPTXISD::Suld1DArrayI32Clamp:353case NVPTXISD::Suld1DArrayI64Clamp:354case NVPTXISD::Suld1DArrayV2I8Clamp:355case NVPTXISD::Suld1DArrayV2I16Clamp:356case NVPTXISD::Suld1DArrayV2I32Clamp:357case NVPTXISD::Suld1DArrayV2I64Clamp:358case NVPTXISD::Suld1DArrayV4I8Clamp:359case NVPTXISD::Suld1DArrayV4I16Clamp:360case NVPTXISD::Suld1DArrayV4I32Clamp:361case NVPTXISD::Suld2DI8Clamp:362case NVPTXISD::Suld2DI16Clamp:363case NVPTXISD::Suld2DI32Clamp:364case NVPTXISD::Suld2DI64Clamp:365case NVPTXISD::Suld2DV2I8Clamp:366case NVPTXISD::Suld2DV2I16Clamp:367case NVPTXISD::Suld2DV2I32Clamp:368case NVPTXISD::Suld2DV2I64Clamp:369case NVPTXISD::Suld2DV4I8Clamp:370case NVPTXISD::Suld2DV4I16Clamp:371case NVPTXISD::Suld2DV4I32Clamp:372case NVPTXISD::Suld2DArrayI8Clamp:373case NVPTXISD::Suld2DArrayI16Clamp:374case NVPTXISD::Suld2DArrayI32Clamp:375case NVPTXISD::Suld2DArrayI64Clamp:376case NVPTXISD::Suld2DArrayV2I8Clamp:377case NVPTXISD::Suld2DArrayV2I16Clamp:378case NVPTXISD::Suld2DArrayV2I32Clamp:379case NVPTXISD::Suld2DArrayV2I64Clamp:380case NVPTXISD::Suld2DArrayV4I8Clamp:381case NVPTXISD::Suld2DArrayV4I16Clamp:382case NVPTXISD::Suld2DArrayV4I32Clamp:383case NVPTXISD::Suld3DI8Clamp:384case NVPTXISD::Suld3DI16Clamp:385case NVPTXISD::Suld3DI32Clamp:386case NVPTXISD::Suld3DI64Clamp:387case NVPTXISD::Suld3DV2I8Clamp:388case NVPTXISD::Suld3DV2I16Clamp:389case NVPTXISD::Suld3DV2I32Clamp:390case NVPTXISD::Suld3DV2I64Clamp:391case NVPTXISD::Suld3DV4I8Clamp:392case NVPTXISD::Suld3DV4I16Clamp:393case NVPTXISD::Suld3DV4I32Clamp:394case NVPTXISD::Suld1DI8Trap:395case NVPTXISD::Suld1DI16Trap:396case NVPTXISD::Suld1DI32Trap:397case NVPTXISD::Suld1DI64Trap:398case NVPTXISD::Suld1DV2I8Trap:399case NVPTXISD::Suld1DV2I16Trap:400case NVPTXISD::Suld1DV2I32Trap:401case NVPTXISD::Suld1DV2I64Trap:402case NVPTXISD::Suld1DV4I8Trap:403case NVPTXISD::Suld1DV4I16Trap:404case NVPTXISD::Suld1DV4I32Trap:405case NVPTXISD::Suld1DArrayI8Trap:406case NVPTXISD::Suld1DArrayI16Trap:407case NVPTXISD::Suld1DArrayI32Trap:408case NVPTXISD::Suld1DArrayI64Trap:409case NVPTXISD::Suld1DArrayV2I8Trap:410case NVPTXISD::Suld1DArrayV2I16Trap:411case NVPTXISD::Suld1DArrayV2I32Trap:412case NVPTXISD::Suld1DArrayV2I64Trap:413case NVPTXISD::Suld1DArrayV4I8Trap:414case NVPTXISD::Suld1DArrayV4I16Trap:415case NVPTXISD::Suld1DArrayV4I32Trap:416case NVPTXISD::Suld2DI8Trap:417case NVPTXISD::Suld2DI16Trap:418case NVPTXISD::Suld2DI32Trap:419case NVPTXISD::Suld2DI64Trap:420case NVPTXISD::Suld2DV2I8Trap:421case NVPTXISD::Suld2DV2I16Trap:422case NVPTXISD::Suld2DV2I32Trap:423case NVPTXISD::Suld2DV2I64Trap:424case NVPTXISD::Suld2DV4I8Trap:425case NVPTXISD::Suld2DV4I16Trap:426case NVPTXISD::Suld2DV4I32Trap:427case NVPTXISD::Suld2DArrayI8Trap:428case NVPTXISD::Suld2DArrayI16Trap:429case NVPTXISD::Suld2DArrayI32Trap:430case NVPTXISD::Suld2DArrayI64Trap:431case NVPTXISD::Suld2DArrayV2I8Trap:432case NVPTXISD::Suld2DArrayV2I16Trap:433case NVPTXISD::Suld2DArrayV2I32Trap:434case NVPTXISD::Suld2DArrayV2I64Trap:435case NVPTXISD::Suld2DArrayV4I8Trap:436case NVPTXISD::Suld2DArrayV4I16Trap:437case NVPTXISD::Suld2DArrayV4I32Trap:438case NVPTXISD::Suld3DI8Trap:439case NVPTXISD::Suld3DI16Trap:440case NVPTXISD::Suld3DI32Trap:441case NVPTXISD::Suld3DI64Trap:442case NVPTXISD::Suld3DV2I8Trap:443case NVPTXISD::Suld3DV2I16Trap:444case NVPTXISD::Suld3DV2I32Trap:445case NVPTXISD::Suld3DV2I64Trap:446case NVPTXISD::Suld3DV4I8Trap:447case NVPTXISD::Suld3DV4I16Trap:448case NVPTXISD::Suld3DV4I32Trap:449case NVPTXISD::Suld1DI8Zero:450case NVPTXISD::Suld1DI16Zero:451case NVPTXISD::Suld1DI32Zero:452case NVPTXISD::Suld1DI64Zero:453case NVPTXISD::Suld1DV2I8Zero:454case NVPTXISD::Suld1DV2I16Zero:455case NVPTXISD::Suld1DV2I32Zero:456case NVPTXISD::Suld1DV2I64Zero:457case NVPTXISD::Suld1DV4I8Zero:458case NVPTXISD::Suld1DV4I16Zero:459case NVPTXISD::Suld1DV4I32Zero:460case NVPTXISD::Suld1DArrayI8Zero:461case NVPTXISD::Suld1DArrayI16Zero:462case NVPTXISD::Suld1DArrayI32Zero:463case NVPTXISD::Suld1DArrayI64Zero:464case NVPTXISD::Suld1DArrayV2I8Zero:465case NVPTXISD::Suld1DArrayV2I16Zero:466case NVPTXISD::Suld1DArrayV2I32Zero:467case NVPTXISD::Suld1DArrayV2I64Zero:468case NVPTXISD::Suld1DArrayV4I8Zero:469case NVPTXISD::Suld1DArrayV4I16Zero:470case NVPTXISD::Suld1DArrayV4I32Zero:471case NVPTXISD::Suld2DI8Zero:472case NVPTXISD::Suld2DI16Zero:473case NVPTXISD::Suld2DI32Zero:474case NVPTXISD::Suld2DI64Zero:475case NVPTXISD::Suld2DV2I8Zero:476case NVPTXISD::Suld2DV2I16Zero:477case NVPTXISD::Suld2DV2I32Zero:478case NVPTXISD::Suld2DV2I64Zero:479case NVPTXISD::Suld2DV4I8Zero:480case NVPTXISD::Suld2DV4I16Zero:481case NVPTXISD::Suld2DV4I32Zero:482case NVPTXISD::Suld2DArrayI8Zero:483case NVPTXISD::Suld2DArrayI16Zero:484case NVPTXISD::Suld2DArrayI32Zero:485case NVPTXISD::Suld2DArrayI64Zero:486case NVPTXISD::Suld2DArrayV2I8Zero:487case NVPTXISD::Suld2DArrayV2I16Zero:488case NVPTXISD::Suld2DArrayV2I32Zero:489case NVPTXISD::Suld2DArrayV2I64Zero:490case NVPTXISD::Suld2DArrayV4I8Zero:491case NVPTXISD::Suld2DArrayV4I16Zero:492case NVPTXISD::Suld2DArrayV4I32Zero:493case NVPTXISD::Suld3DI8Zero:494case NVPTXISD::Suld3DI16Zero:495case NVPTXISD::Suld3DI32Zero:496case NVPTXISD::Suld3DI64Zero:497case NVPTXISD::Suld3DV2I8Zero:498case NVPTXISD::Suld3DV2I16Zero:499case NVPTXISD::Suld3DV2I32Zero:500case NVPTXISD::Suld3DV2I64Zero:501case NVPTXISD::Suld3DV4I8Zero:502case NVPTXISD::Suld3DV4I16Zero:503case NVPTXISD::Suld3DV4I32Zero:504if (trySurfaceIntrinsic(N))505return;506break;507case ISD::AND:508case ISD::SRA:509case ISD::SRL:510// Try to select BFE511if (tryBFE(N))512return;513break;514case ISD::ADDRSPACECAST:515SelectAddrSpaceCast(N);516return;517case ISD::ConstantFP:518if (tryConstantFP(N))519return;520break;521case ISD::CopyToReg: {522if (N->getOperand(1).getValueType() == MVT::i128) {523SelectV2I64toI128(N);524return;525}526break;527}528case ISD::CopyFromReg: {529if (N->getOperand(1).getValueType() == MVT::i128) {530SelectI128toV2I64(N);531return;532}533break;534}535default:536break;537}538SelectCode(N);539}540541bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {542unsigned IID = N->getConstantOperandVal(1);543switch (IID) {544default:545return false;546case Intrinsic::nvvm_ldg_global_f:547case Intrinsic::nvvm_ldg_global_i:548case Intrinsic::nvvm_ldg_global_p:549case Intrinsic::nvvm_ldu_global_f:550case Intrinsic::nvvm_ldu_global_i:551case Intrinsic::nvvm_ldu_global_p:552return tryLDGLDU(N);553}554}555556// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we557// have to load them into an .(b)f16 register first.558bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {559if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)560return false;561SDValue Val = CurDAG->getTargetConstantFP(562cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));563SDNode *LoadConstF16 = CurDAG->getMachineNode(564(N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16565: NVPTX::LOAD_CONST_BF16),566SDLoc(N), N->getValueType(0), Val);567ReplaceNode(N, LoadConstF16);568return true;569}570571// Map ISD:CONDCODE value to appropriate CmpMode expected by572// NVPTXInstPrinter::printCmpMode()573static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {574using NVPTX::PTXCmpMode::CmpMode;575unsigned PTXCmpMode = [](ISD::CondCode CC) {576switch (CC) {577default:578llvm_unreachable("Unexpected condition code.");579case ISD::SETOEQ:580return CmpMode::EQ;581case ISD::SETOGT:582return CmpMode::GT;583case ISD::SETOGE:584return CmpMode::GE;585case ISD::SETOLT:586return CmpMode::LT;587case ISD::SETOLE:588return CmpMode::LE;589case ISD::SETONE:590return CmpMode::NE;591case ISD::SETO:592return CmpMode::NUM;593case ISD::SETUO:594return CmpMode::NotANumber;595case ISD::SETUEQ:596return CmpMode::EQU;597case ISD::SETUGT:598return CmpMode::GTU;599case ISD::SETUGE:600return CmpMode::GEU;601case ISD::SETULT:602return CmpMode::LTU;603case ISD::SETULE:604return CmpMode::LEU;605case ISD::SETUNE:606return CmpMode::NEU;607case ISD::SETEQ:608return CmpMode::EQ;609case ISD::SETGT:610return CmpMode::GT;611case ISD::SETGE:612return CmpMode::GE;613case ISD::SETLT:614return CmpMode::LT;615case ISD::SETLE:616return CmpMode::LE;617case ISD::SETNE:618return CmpMode::NE;619}620}(CondCode.get());621622if (FTZ)623PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;624625return PTXCmpMode;626}627628bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {629unsigned PTXCmpMode =630getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());631SDLoc DL(N);632SDNode *SetP = CurDAG->getMachineNode(633NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),634N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));635ReplaceNode(N, SetP);636return true;637}638639bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {640unsigned PTXCmpMode =641getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());642SDLoc DL(N);643SDNode *SetP = CurDAG->getMachineNode(644NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),645N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));646ReplaceNode(N, SetP);647return true;648}649650// Find all instances of extract_vector_elt that use this v2f16 vector651// and coalesce them into a scattering move instruction.652bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {653SDValue Vector = N->getOperand(0);654655// We only care about 16x2 as it's the only real vector type we656// need to deal with.657MVT VT = Vector.getSimpleValueType();658if (!Isv2x16VT(VT))659return false;660// Find and record all uses of this vector that extract element 0 or 1.661SmallVector<SDNode *, 4> E0, E1;662for (auto *U : Vector.getNode()->uses()) {663if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)664continue;665if (U->getOperand(0) != Vector)666continue;667if (const ConstantSDNode *IdxConst =668dyn_cast<ConstantSDNode>(U->getOperand(1))) {669if (IdxConst->getZExtValue() == 0)670E0.push_back(U);671else if (IdxConst->getZExtValue() == 1)672E1.push_back(U);673else674llvm_unreachable("Invalid vector index.");675}676}677678// There's no point scattering f16x2 if we only ever access one679// element of it.680if (E0.empty() || E1.empty())681return false;682683// Merge (f16 extractelt(V, 0), f16 extractelt(V,1))684// into f16,f16 SplitF16x2(V)685MVT EltVT = VT.getVectorElementType();686SDNode *ScatterOp =687CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);688for (auto *Node : E0)689ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));690for (auto *Node : E1)691ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));692693return true;694}695696static unsigned int getCodeAddrSpace(MemSDNode *N) {697const Value *Src = N->getMemOperand()->getValue();698699if (!Src)700return NVPTX::PTXLdStInstCode::GENERIC;701702if (auto *PT = dyn_cast<PointerType>(Src->getType())) {703switch (PT->getAddressSpace()) {704case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;705case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;706case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;707case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;708case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;709case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;710default: break;711}712}713return NVPTX::PTXLdStInstCode::GENERIC;714}715716static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,717unsigned CodeAddrSpace, MachineFunction *F) {718// We use ldg (i.e. ld.global.nc) for invariant loads from the global address719// space.720//721// We have two ways of identifying invariant loads: Loads may be explicitly722// marked as invariant, or we may infer them to be invariant.723//724// We currently infer invariance for loads from725// - constant global variables, and726// - kernel function pointer params that are noalias (i.e. __restrict) and727// never written to.728//729// TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally730// not during the SelectionDAG phase).731//732// TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for733// explicitly invariant loads because these are how clang tells us to use ldg734// when the user uses a builtin.735if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)736return false;737738if (N->isInvariant())739return true;740741bool IsKernelFn = isKernelFunction(F->getFunction());742743// We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly744// because the former looks through phi nodes while the latter does not. We745// need to look through phi nodes to handle pointer induction variables.746SmallVector<const Value *, 8> Objs;747getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);748749return all_of(Objs, [&](const Value *V) {750if (auto *A = dyn_cast<const Argument>(V))751return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();752if (auto *GV = dyn_cast<const GlobalVariable>(V))753return GV->isConstant();754return false;755});756}757758bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {759unsigned IID = N->getConstantOperandVal(0);760switch (IID) {761default:762return false;763case Intrinsic::nvvm_texsurf_handle_internal:764SelectTexSurfHandle(N);765return true;766}767}768769void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {770// Op 0 is the intrinsic ID771SDValue Wrapper = N->getOperand(1);772SDValue GlobalVal = Wrapper.getOperand(0);773ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),774MVT::i64, GlobalVal));775}776777void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {778SDValue Src = N->getOperand(0);779AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);780unsigned SrcAddrSpace = CastN->getSrcAddressSpace();781unsigned DstAddrSpace = CastN->getDestAddressSpace();782assert(SrcAddrSpace != DstAddrSpace &&783"addrspacecast must be between different address spaces");784785if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {786// Specific to generic787unsigned Opc;788switch (SrcAddrSpace) {789default: report_fatal_error("Bad address space in addrspacecast");790case ADDRESS_SPACE_GLOBAL:791Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;792break;793case ADDRESS_SPACE_SHARED:794Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32795? NVPTX::cvta_shared_6432796: NVPTX::cvta_shared_64)797: NVPTX::cvta_shared;798break;799case ADDRESS_SPACE_CONST:800Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32801? NVPTX::cvta_const_6432802: NVPTX::cvta_const_64)803: NVPTX::cvta_const;804break;805case ADDRESS_SPACE_LOCAL:806Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32807? NVPTX::cvta_local_6432808: NVPTX::cvta_local_64)809: NVPTX::cvta_local;810break;811}812ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),813Src));814return;815} else {816// Generic to specific817if (SrcAddrSpace != 0)818report_fatal_error("Cannot cast between two non-generic address spaces");819unsigned Opc;820switch (DstAddrSpace) {821default: report_fatal_error("Bad address space in addrspacecast");822case ADDRESS_SPACE_GLOBAL:823Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;824break;825case ADDRESS_SPACE_SHARED:826Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32827? NVPTX::cvta_to_shared_3264828: NVPTX::cvta_to_shared_64)829: NVPTX::cvta_to_shared;830break;831case ADDRESS_SPACE_CONST:832Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32833? NVPTX::cvta_to_const_3264834: NVPTX::cvta_to_const_64)835: NVPTX::cvta_to_const;836break;837case ADDRESS_SPACE_LOCAL:838Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32839? NVPTX::cvta_to_local_3264840: NVPTX::cvta_to_local_64)841: NVPTX::cvta_to_local;842break;843case ADDRESS_SPACE_PARAM:844Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64845: NVPTX::nvvm_ptr_gen_to_param;846break;847}848ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),849Src));850return;851}852}853854// Helper function template to reduce amount of boilerplate code for855// opcode selection.856static std::optional<unsigned>857pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,858unsigned Opcode_i16, unsigned Opcode_i32,859std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,860std::optional<unsigned> Opcode_f64) {861switch (VT) {862case MVT::i1:863case MVT::i8:864return Opcode_i8;865case MVT::i16:866return Opcode_i16;867case MVT::i32:868return Opcode_i32;869case MVT::i64:870return Opcode_i64;871case MVT::f16:872case MVT::bf16:873return Opcode_i16;874case MVT::v2f16:875case MVT::v2bf16:876case MVT::v2i16:877case MVT::v4i8:878return Opcode_i32;879case MVT::f32:880return Opcode_f32;881case MVT::f64:882return Opcode_f64;883default:884return std::nullopt;885}886}887888static int getLdStRegType(EVT VT) {889if (VT.isFloatingPoint())890switch (VT.getSimpleVT().SimpleTy) {891case MVT::f16:892case MVT::bf16:893case MVT::v2f16:894case MVT::v2bf16:895return NVPTX::PTXLdStInstCode::Untyped;896default:897return NVPTX::PTXLdStInstCode::Float;898}899else900return NVPTX::PTXLdStInstCode::Unsigned;901}902903bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {904SDLoc dl(N);905MemSDNode *LD = cast<MemSDNode>(N);906assert(LD->readMem() && "Expected load");907LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);908EVT LoadedVT = LD->getMemoryVT();909SDNode *NVPTXLD = nullptr;910911// do not support pre/post inc/dec912if (PlainLoad && PlainLoad->isIndexed())913return false;914915if (!LoadedVT.isSimple())916return false;917918AtomicOrdering Ordering = LD->getSuccessOrdering();919// In order to lower atomic loads with stronger guarantees we would need to920// use load.acquire or insert fences. However these features were only added921// with PTX ISA 6.0 / sm_70.922// TODO: Check if we can actually use the new instructions and implement them.923if (isStrongerThanMonotonic(Ordering))924return false;925926// Address Space Setting927unsigned int CodeAddrSpace = getCodeAddrSpace(LD);928if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {929return tryLDGLDU(N);930}931932unsigned int PointerSize =933CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());934935// Volatile Setting936// - .volatile is only available for .global and .shared937// - .volatile has the same memory synchronization semantics as .relaxed.sys938bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;939if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&940CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&941CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)942isVolatile = false;943944// Type Setting: fromType + fromTypeWidth945//946// Sign : ISD::SEXTLOAD947// Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the948// type is integer949// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float950MVT SimpleVT = LoadedVT.getSimpleVT();951MVT ScalarVT = SimpleVT.getScalarType();952// Read at least 8 bits (predicates are stored as 8-bit values)953unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());954unsigned int fromType;955956// Vector Setting957unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;958if (SimpleVT.isVector()) {959assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&960"Unexpected vector type");961// v2f16/v2bf16/v2i16 is loaded using ld.b32962fromTypeWidth = 32;963}964965if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))966fromType = NVPTX::PTXLdStInstCode::Signed;967else968fromType = getLdStRegType(ScalarVT);969970// Create the machine instruction DAG971SDValue Chain = N->getOperand(0);972SDValue N1 = N->getOperand(1);973SDValue Addr;974SDValue Offset, Base;975std::optional<unsigned> Opcode;976MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;977978if (SelectDirectAddr(N1, Addr)) {979Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,980NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,981NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);982if (!Opcode)983return false;984SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),985getI32Imm(vecType, dl), getI32Imm(fromType, dl),986getI32Imm(fromTypeWidth, dl), Addr, Chain };987NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);988} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)989: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {990Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,991NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,992NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);993if (!Opcode)994return false;995SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),996getI32Imm(vecType, dl), getI32Imm(fromType, dl),997getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };998NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);999} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)1000: SelectADDRri(N1.getNode(), N1, Base, Offset)) {1001if (PointerSize == 64)1002Opcode =1003pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,1004NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,1005NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);1006else1007Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,1008NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,1009NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);1010if (!Opcode)1011return false;1012SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),1013getI32Imm(vecType, dl), getI32Imm(fromType, dl),1014getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };1015NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);1016} else {1017if (PointerSize == 64)1018Opcode =1019pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,1020NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,1021NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);1022else1023Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,1024NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,1025NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);1026if (!Opcode)1027return false;1028SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),1029getI32Imm(vecType, dl), getI32Imm(fromType, dl),1030getI32Imm(fromTypeWidth, dl), N1, Chain };1031NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);1032}10331034if (!NVPTXLD)1035return false;10361037MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();1038CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});10391040ReplaceNode(N, NVPTXLD);1041return true;1042}10431044bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {10451046SDValue Chain = N->getOperand(0);1047SDValue Op1 = N->getOperand(1);1048SDValue Addr, Offset, Base;1049std::optional<unsigned> Opcode;1050SDLoc DL(N);1051SDNode *LD;1052MemSDNode *MemSD = cast<MemSDNode>(N);1053EVT LoadedVT = MemSD->getMemoryVT();10541055if (!LoadedVT.isSimple())1056return false;10571058// Address Space Setting1059unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);1060if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {1061return tryLDGLDU(N);1062}10631064unsigned int PointerSize =1065CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());10661067// Volatile Setting1068// - .volatile is only availalble for .global and .shared1069bool IsVolatile = MemSD->isVolatile();1070if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&1071CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&1072CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)1073IsVolatile = false;10741075// Vector Setting1076MVT SimpleVT = LoadedVT.getSimpleVT();10771078// Type Setting: fromType + fromTypeWidth1079//1080// Sign : ISD::SEXTLOAD1081// Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the1082// type is integer1083// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float1084MVT ScalarVT = SimpleVT.getScalarType();1085// Read at least 8 bits (predicates are stored as 8-bit values)1086unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());1087unsigned int FromType;1088// The last operand holds the original LoadSDNode::getExtensionType() value1089unsigned ExtensionType = cast<ConstantSDNode>(1090N->getOperand(N->getNumOperands() - 1))->getZExtValue();1091if (ExtensionType == ISD::SEXTLOAD)1092FromType = NVPTX::PTXLdStInstCode::Signed;1093else1094FromType = getLdStRegType(ScalarVT);10951096unsigned VecType;10971098switch (N->getOpcode()) {1099case NVPTXISD::LoadV2:1100VecType = NVPTX::PTXLdStInstCode::V2;1101break;1102case NVPTXISD::LoadV4:1103VecType = NVPTX::PTXLdStInstCode::V4;1104break;1105default:1106return false;1107}11081109EVT EltVT = N->getValueType(0);11101111// v8x16 is a special case. PTX doesn't have ld.v8.161112// instruction. Instead, we split the vector into v2x16 chunks and1113// load them with ld.v4.b32.1114if (Isv2x16VT(EltVT)) {1115assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");1116EltVT = MVT::i32;1117FromType = NVPTX::PTXLdStInstCode::Untyped;1118FromTypeWidth = 32;1119}11201121if (SelectDirectAddr(Op1, Addr)) {1122switch (N->getOpcode()) {1123default:1124return false;1125case NVPTXISD::LoadV2:1126Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1127NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,1128NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,1129NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);1130break;1131case NVPTXISD::LoadV4:1132Opcode =1133pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,1134NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,1135std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);1136break;1137}1138if (!Opcode)1139return false;1140SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),1141getI32Imm(VecType, DL), getI32Imm(FromType, DL),1142getI32Imm(FromTypeWidth, DL), Addr, Chain };1143LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);1144} else if (PointerSize == 641145? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)1146: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {1147switch (N->getOpcode()) {1148default:1149return false;1150case NVPTXISD::LoadV2:1151Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1152NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,1153NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,1154NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);1155break;1156case NVPTXISD::LoadV4:1157Opcode =1158pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,1159NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,1160std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);1161break;1162}1163if (!Opcode)1164return false;1165SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),1166getI32Imm(VecType, DL), getI32Imm(FromType, DL),1167getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };1168LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);1169} else if (PointerSize == 641170? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)1171: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {1172if (PointerSize == 64) {1173switch (N->getOpcode()) {1174default:1175return false;1176case NVPTXISD::LoadV2:1177Opcode =1178pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1179NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,1180NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,1181NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);1182break;1183case NVPTXISD::LoadV4:1184Opcode = pickOpcodeForVT(1185EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,1186NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,1187NVPTX::LDV_f32_v4_ari_64, std::nullopt);1188break;1189}1190} else {1191switch (N->getOpcode()) {1192default:1193return false;1194case NVPTXISD::LoadV2:1195Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1196NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,1197NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,1198NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);1199break;1200case NVPTXISD::LoadV4:1201Opcode =1202pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,1203NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,1204std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);1205break;1206}1207}1208if (!Opcode)1209return false;1210SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),1211getI32Imm(VecType, DL), getI32Imm(FromType, DL),1212getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };12131214LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);1215} else {1216if (PointerSize == 64) {1217switch (N->getOpcode()) {1218default:1219return false;1220case NVPTXISD::LoadV2:1221Opcode = pickOpcodeForVT(1222EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,1223NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,1224NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,1225NVPTX::LDV_f64_v2_areg_64);1226break;1227case NVPTXISD::LoadV4:1228Opcode = pickOpcodeForVT(1229EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,1230NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,1231NVPTX::LDV_f32_v4_areg_64, std::nullopt);1232break;1233}1234} else {1235switch (N->getOpcode()) {1236default:1237return false;1238case NVPTXISD::LoadV2:1239Opcode =1240pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,1241NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,1242NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,1243NVPTX::LDV_f64_v2_areg);1244break;1245case NVPTXISD::LoadV4:1246Opcode =1247pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,1248NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,1249std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);1250break;1251}1252}1253if (!Opcode)1254return false;1255SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),1256getI32Imm(VecType, DL), getI32Imm(FromType, DL),1257getI32Imm(FromTypeWidth, DL), Op1, Chain };1258LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);1259}12601261MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();1262CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});12631264ReplaceNode(N, LD);1265return true;1266}12671268bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {12691270SDValue Chain = N->getOperand(0);1271SDValue Op1;1272MemSDNode *Mem;1273bool IsLDG = true;12741275// If this is an LDG intrinsic, the address is the third operand. If its an1276// LDG/LDU SD node (from custom vector handling), then its the second operand1277if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {1278Op1 = N->getOperand(2);1279Mem = cast<MemIntrinsicSDNode>(N);1280unsigned IID = N->getConstantOperandVal(1);1281switch (IID) {1282default:1283return false;1284case Intrinsic::nvvm_ldg_global_f:1285case Intrinsic::nvvm_ldg_global_i:1286case Intrinsic::nvvm_ldg_global_p:1287IsLDG = true;1288break;1289case Intrinsic::nvvm_ldu_global_f:1290case Intrinsic::nvvm_ldu_global_i:1291case Intrinsic::nvvm_ldu_global_p:1292IsLDG = false;1293break;1294}1295} else {1296Op1 = N->getOperand(1);1297Mem = cast<MemSDNode>(N);1298}12991300std::optional<unsigned> Opcode;1301SDLoc DL(N);1302SDNode *LD;1303SDValue Base, Offset, Addr;1304EVT OrigType = N->getValueType(0);13051306EVT EltVT = Mem->getMemoryVT();1307unsigned NumElts = 1;1308if (EltVT.isVector()) {1309NumElts = EltVT.getVectorNumElements();1310EltVT = EltVT.getVectorElementType();1311// vectors of 16bits type are loaded/stored as multiples of v2x16 elements.1312if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||1313(EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||1314(EltVT == MVT::i16 && OrigType == MVT::v2i16)) {1315assert(NumElts % 2 == 0 && "Vector must have even number of elements");1316EltVT = OrigType;1317NumElts /= 2;1318} else if (OrigType == MVT::v4i8) {1319EltVT = OrigType;1320NumElts = 1;1321}1322}13231324// Build the "promoted" result VTList for the load. If we are really loading1325// i8s, then the return type will be promoted to i16 since we do not expose1326// 8-bit registers in NVPTX.1327EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;1328SmallVector<EVT, 5> InstVTs;1329for (unsigned i = 0; i != NumElts; ++i) {1330InstVTs.push_back(NodeVT);1331}1332InstVTs.push_back(MVT::Other);1333SDVTList InstVTList = CurDAG->getVTList(InstVTs);13341335if (SelectDirectAddr(Op1, Addr)) {1336switch (N->getOpcode()) {1337default:1338return false;1339case ISD::LOAD:1340case ISD::INTRINSIC_W_CHAIN:1341if (IsLDG)1342Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1343NVPTX::INT_PTX_LDG_GLOBAL_i8avar,1344NVPTX::INT_PTX_LDG_GLOBAL_i16avar,1345NVPTX::INT_PTX_LDG_GLOBAL_i32avar,1346NVPTX::INT_PTX_LDG_GLOBAL_i64avar,1347NVPTX::INT_PTX_LDG_GLOBAL_f32avar,1348NVPTX::INT_PTX_LDG_GLOBAL_f64avar);1349else1350Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1351NVPTX::INT_PTX_LDU_GLOBAL_i8avar,1352NVPTX::INT_PTX_LDU_GLOBAL_i16avar,1353NVPTX::INT_PTX_LDU_GLOBAL_i32avar,1354NVPTX::INT_PTX_LDU_GLOBAL_i64avar,1355NVPTX::INT_PTX_LDU_GLOBAL_f32avar,1356NVPTX::INT_PTX_LDU_GLOBAL_f64avar);1357break;1358case NVPTXISD::LoadV2:1359case NVPTXISD::LDGV2:1360Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1361NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,1362NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,1363NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,1364NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,1365NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,1366NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);1367break;1368case NVPTXISD::LDUV2:1369Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1370NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,1371NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,1372NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,1373NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,1374NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,1375NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);1376break;1377case NVPTXISD::LoadV4:1378case NVPTXISD::LDGV4:1379Opcode = pickOpcodeForVT(1380EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,1381NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,1382NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,1383NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);1384break;1385case NVPTXISD::LDUV4:1386Opcode = pickOpcodeForVT(1387EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,1388NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,1389NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,1390NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);1391break;1392}1393if (!Opcode)1394return false;1395SDValue Ops[] = { Addr, Chain };1396LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);1397} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)1398: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {1399if (TM.is64Bit()) {1400switch (N->getOpcode()) {1401default:1402return false;1403case ISD::LOAD:1404case ISD::INTRINSIC_W_CHAIN:1405if (IsLDG)1406Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1407NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,1408NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,1409NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,1410NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,1411NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,1412NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);1413else1414Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1415NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,1416NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,1417NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,1418NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,1419NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,1420NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);1421break;1422case NVPTXISD::LoadV2:1423case NVPTXISD::LDGV2:1424Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1425NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,1426NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,1427NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,1428NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,1429NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,1430NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);1431break;1432case NVPTXISD::LDUV2:1433Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1434NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,1435NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,1436NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,1437NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,1438NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,1439NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);1440break;1441case NVPTXISD::LoadV4:1442case NVPTXISD::LDGV4:1443Opcode = pickOpcodeForVT(1444EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,1445NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,1446NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,1447NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);1448break;1449case NVPTXISD::LDUV4:1450Opcode = pickOpcodeForVT(1451EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,1452NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,1453NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,1454NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);1455break;1456}1457} else {1458switch (N->getOpcode()) {1459default:1460return false;1461case ISD::LOAD:1462case ISD::INTRINSIC_W_CHAIN:1463if (IsLDG)1464Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1465NVPTX::INT_PTX_LDG_GLOBAL_i8ari,1466NVPTX::INT_PTX_LDG_GLOBAL_i16ari,1467NVPTX::INT_PTX_LDG_GLOBAL_i32ari,1468NVPTX::INT_PTX_LDG_GLOBAL_i64ari,1469NVPTX::INT_PTX_LDG_GLOBAL_f32ari,1470NVPTX::INT_PTX_LDG_GLOBAL_f64ari);1471else1472Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1473NVPTX::INT_PTX_LDU_GLOBAL_i8ari,1474NVPTX::INT_PTX_LDU_GLOBAL_i16ari,1475NVPTX::INT_PTX_LDU_GLOBAL_i32ari,1476NVPTX::INT_PTX_LDU_GLOBAL_i64ari,1477NVPTX::INT_PTX_LDU_GLOBAL_f32ari,1478NVPTX::INT_PTX_LDU_GLOBAL_f64ari);1479break;1480case NVPTXISD::LoadV2:1481case NVPTXISD::LDGV2:1482Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1483NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,1484NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,1485NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,1486NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,1487NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,1488NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);1489break;1490case NVPTXISD::LDUV2:1491Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1492NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,1493NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,1494NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,1495NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,1496NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,1497NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);1498break;1499case NVPTXISD::LoadV4:1500case NVPTXISD::LDGV4:1501Opcode = pickOpcodeForVT(1502EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,1503NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,1504NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,1505NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);1506break;1507case NVPTXISD::LDUV4:1508Opcode = pickOpcodeForVT(1509EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,1510NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,1511NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,1512NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);1513break;1514}1515}1516if (!Opcode)1517return false;1518SDValue Ops[] = {Base, Offset, Chain};1519LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);1520} else {1521if (TM.is64Bit()) {1522switch (N->getOpcode()) {1523default:1524return false;1525case ISD::LOAD:1526case ISD::INTRINSIC_W_CHAIN:1527if (IsLDG)1528Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1529NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,1530NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,1531NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,1532NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,1533NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,1534NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);1535else1536Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1537NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,1538NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,1539NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,1540NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,1541NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,1542NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);1543break;1544case NVPTXISD::LoadV2:1545case NVPTXISD::LDGV2:1546Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1547NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,1548NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,1549NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,1550NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,1551NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,1552NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);1553break;1554case NVPTXISD::LDUV2:1555Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1556NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,1557NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,1558NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,1559NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,1560NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,1561NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);1562break;1563case NVPTXISD::LoadV4:1564case NVPTXISD::LDGV4:1565Opcode = pickOpcodeForVT(1566EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,1567NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,1568NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,1569NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);1570break;1571case NVPTXISD::LDUV4:1572Opcode = pickOpcodeForVT(1573EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,1574NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,1575NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,1576NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);1577break;1578}1579} else {1580switch (N->getOpcode()) {1581default:1582return false;1583case ISD::LOAD:1584case ISD::INTRINSIC_W_CHAIN:1585if (IsLDG)1586Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1587NVPTX::INT_PTX_LDG_GLOBAL_i8areg,1588NVPTX::INT_PTX_LDG_GLOBAL_i16areg,1589NVPTX::INT_PTX_LDG_GLOBAL_i32areg,1590NVPTX::INT_PTX_LDG_GLOBAL_i64areg,1591NVPTX::INT_PTX_LDG_GLOBAL_f32areg,1592NVPTX::INT_PTX_LDG_GLOBAL_f64areg);1593else1594Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1595NVPTX::INT_PTX_LDU_GLOBAL_i8areg,1596NVPTX::INT_PTX_LDU_GLOBAL_i16areg,1597NVPTX::INT_PTX_LDU_GLOBAL_i32areg,1598NVPTX::INT_PTX_LDU_GLOBAL_i64areg,1599NVPTX::INT_PTX_LDU_GLOBAL_f32areg,1600NVPTX::INT_PTX_LDU_GLOBAL_f64areg);1601break;1602case NVPTXISD::LoadV2:1603case NVPTXISD::LDGV2:1604Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1605NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,1606NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,1607NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,1608NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,1609NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,1610NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);1611break;1612case NVPTXISD::LDUV2:1613Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1614NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,1615NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,1616NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,1617NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,1618NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,1619NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);1620break;1621case NVPTXISD::LoadV4:1622case NVPTXISD::LDGV4:1623Opcode = pickOpcodeForVT(1624EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,1625NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,1626NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,1627NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);1628break;1629case NVPTXISD::LDUV4:1630Opcode = pickOpcodeForVT(1631EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,1632NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,1633NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,1634NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);1635break;1636}1637}1638if (!Opcode)1639return false;1640SDValue Ops[] = { Op1, Chain };1641LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);1642}16431644// For automatic generation of LDG (through SelectLoad[Vector], not the1645// intrinsics), we may have an extending load like:1646//1647// i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i641648//1649// In this case, the matching logic above will select a load for the original1650// memory type (in this case, i8) and our types will not match (the node needs1651// to return an i32 in this case). Our LDG/LDU nodes do not support the1652// concept of sign-/zero-extension, so emulate it here by adding an explicit1653// CVT instruction. Ptxas should clean up any redundancies here.16541655LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);16561657if (OrigType != EltVT &&1658(LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {1659// We have an extending-load. The instruction we selected operates on the1660// smaller type, but the SDNode we are replacing has the larger type. We1661// need to emit a CVT to make the types match.1662unsigned CvtOpc =1663GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);16641665// For each output value, apply the manual sign/zero-extension and make sure1666// all users of the load go through that CVT.1667for (unsigned i = 0; i != NumElts; ++i) {1668SDValue Res(LD, i);1669SDValue OrigVal(N, i);16701671SDNode *CvtNode =1672CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,1673CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,1674DL, MVT::i32));1675ReplaceUses(OrigVal, SDValue(CvtNode, 0));1676}1677}16781679ReplaceNode(N, LD);1680return true;1681}16821683bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {1684SDLoc dl(N);1685MemSDNode *ST = cast<MemSDNode>(N);1686assert(ST->writeMem() && "Expected store");1687StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);1688AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);1689assert((PlainStore || AtomicStore) && "Expected store");1690EVT StoreVT = ST->getMemoryVT();1691SDNode *NVPTXST = nullptr;16921693// do not support pre/post inc/dec1694if (PlainStore && PlainStore->isIndexed())1695return false;16961697if (!StoreVT.isSimple())1698return false;16991700AtomicOrdering Ordering = ST->getSuccessOrdering();1701// In order to lower atomic loads with stronger guarantees we would need to1702// use store.release or insert fences. However these features were only added1703// with PTX ISA 6.0 / sm_70.1704// TODO: Check if we can actually use the new instructions and implement them.1705if (isStrongerThanMonotonic(Ordering))1706return false;17071708// Address Space Setting1709unsigned int CodeAddrSpace = getCodeAddrSpace(ST);1710unsigned int PointerSize =1711CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());17121713// Volatile Setting1714// - .volatile is only available for .global and .shared1715// - .volatile has the same memory synchronization semantics as .relaxed.sys1716bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;1717if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&1718CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&1719CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)1720isVolatile = false;17211722// Vector Setting1723MVT SimpleVT = StoreVT.getSimpleVT();1724unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;17251726// Type Setting: toType + toTypeWidth1727// - for integer type, always use 'u'1728//1729MVT ScalarVT = SimpleVT.getScalarType();1730unsigned toTypeWidth = ScalarVT.getSizeInBits();1731if (SimpleVT.isVector()) {1732assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&1733"Unexpected vector type");1734// v2x16 is stored using st.b321735toTypeWidth = 32;1736}17371738unsigned int toType = getLdStRegType(ScalarVT);17391740// Create the machine instruction DAG1741SDValue Chain = ST->getChain();1742SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();1743SDValue BasePtr = ST->getBasePtr();1744SDValue Addr;1745SDValue Offset, Base;1746std::optional<unsigned> Opcode;1747MVT::SimpleValueType SourceVT =1748Value.getNode()->getSimpleValueType(0).SimpleTy;17491750if (SelectDirectAddr(BasePtr, Addr)) {1751Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,1752NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,1753NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);1754if (!Opcode)1755return false;1756SDValue Ops[] = {Value,1757getI32Imm(isVolatile, dl),1758getI32Imm(CodeAddrSpace, dl),1759getI32Imm(vecType, dl),1760getI32Imm(toType, dl),1761getI32Imm(toTypeWidth, dl),1762Addr,1763Chain};1764NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);1765} else if (PointerSize == 641766? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)1767: SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {1768Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,1769NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,1770NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);1771if (!Opcode)1772return false;1773SDValue Ops[] = {Value,1774getI32Imm(isVolatile, dl),1775getI32Imm(CodeAddrSpace, dl),1776getI32Imm(vecType, dl),1777getI32Imm(toType, dl),1778getI32Imm(toTypeWidth, dl),1779Base,1780Offset,1781Chain};1782NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);1783} else if (PointerSize == 641784? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)1785: SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {1786if (PointerSize == 64)1787Opcode =1788pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,1789NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,1790NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);1791else1792Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,1793NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,1794NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);1795if (!Opcode)1796return false;17971798SDValue Ops[] = {Value,1799getI32Imm(isVolatile, dl),1800getI32Imm(CodeAddrSpace, dl),1801getI32Imm(vecType, dl),1802getI32Imm(toType, dl),1803getI32Imm(toTypeWidth, dl),1804Base,1805Offset,1806Chain};1807NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);1808} else {1809if (PointerSize == 64)1810Opcode =1811pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,1812NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,1813NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);1814else1815Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,1816NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,1817NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);1818if (!Opcode)1819return false;1820SDValue Ops[] = {Value,1821getI32Imm(isVolatile, dl),1822getI32Imm(CodeAddrSpace, dl),1823getI32Imm(vecType, dl),1824getI32Imm(toType, dl),1825getI32Imm(toTypeWidth, dl),1826BasePtr,1827Chain};1828NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);1829}18301831if (!NVPTXST)1832return false;18331834MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();1835CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});1836ReplaceNode(N, NVPTXST);1837return true;1838}18391840bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {1841SDValue Chain = N->getOperand(0);1842SDValue Op1 = N->getOperand(1);1843SDValue Addr, Offset, Base;1844std::optional<unsigned> Opcode;1845SDLoc DL(N);1846SDNode *ST;1847EVT EltVT = Op1.getValueType();1848MemSDNode *MemSD = cast<MemSDNode>(N);1849EVT StoreVT = MemSD->getMemoryVT();18501851// Address Space Setting1852unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);1853if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {1854report_fatal_error("Cannot store to pointer that points to constant "1855"memory space");1856}1857unsigned int PointerSize =1858CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());18591860// Volatile Setting1861// - .volatile is only availalble for .global and .shared1862bool IsVolatile = MemSD->isVolatile();1863if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&1864CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&1865CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)1866IsVolatile = false;18671868// Type Setting: toType + toTypeWidth1869// - for integer type, always use 'u'1870assert(StoreVT.isSimple() && "Store value is not simple");1871MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();1872unsigned ToTypeWidth = ScalarVT.getSizeInBits();1873unsigned ToType = getLdStRegType(ScalarVT);18741875SmallVector<SDValue, 12> StOps;1876SDValue N2;1877unsigned VecType;18781879switch (N->getOpcode()) {1880case NVPTXISD::StoreV2:1881VecType = NVPTX::PTXLdStInstCode::V2;1882StOps.push_back(N->getOperand(1));1883StOps.push_back(N->getOperand(2));1884N2 = N->getOperand(3);1885break;1886case NVPTXISD::StoreV4:1887VecType = NVPTX::PTXLdStInstCode::V4;1888StOps.push_back(N->getOperand(1));1889StOps.push_back(N->getOperand(2));1890StOps.push_back(N->getOperand(3));1891StOps.push_back(N->getOperand(4));1892N2 = N->getOperand(5);1893break;1894default:1895return false;1896}18971898// v8x16 is a special case. PTX doesn't have st.v8.x161899// instruction. Instead, we split the vector into v2x16 chunks and1900// store them with st.v4.b32.1901if (Isv2x16VT(EltVT)) {1902assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");1903EltVT = MVT::i32;1904ToType = NVPTX::PTXLdStInstCode::Untyped;1905ToTypeWidth = 32;1906}19071908StOps.push_back(getI32Imm(IsVolatile, DL));1909StOps.push_back(getI32Imm(CodeAddrSpace, DL));1910StOps.push_back(getI32Imm(VecType, DL));1911StOps.push_back(getI32Imm(ToType, DL));1912StOps.push_back(getI32Imm(ToTypeWidth, DL));19131914if (SelectDirectAddr(N2, Addr)) {1915switch (N->getOpcode()) {1916default:1917return false;1918case NVPTXISD::StoreV2:1919Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1920NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,1921NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,1922NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);1923break;1924case NVPTXISD::StoreV4:1925Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1926NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,1927NVPTX::STV_i32_v4_avar, std::nullopt,1928NVPTX::STV_f32_v4_avar, std::nullopt);1929break;1930}1931StOps.push_back(Addr);1932} else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)1933: SelectADDRsi(N2.getNode(), N2, Base, Offset)) {1934switch (N->getOpcode()) {1935default:1936return false;1937case NVPTXISD::StoreV2:1938Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1939NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,1940NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,1941NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);1942break;1943case NVPTXISD::StoreV4:1944Opcode =1945pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,1946NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,1947std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);1948break;1949}1950StOps.push_back(Base);1951StOps.push_back(Offset);1952} else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)1953: SelectADDRri(N2.getNode(), N2, Base, Offset)) {1954if (PointerSize == 64) {1955switch (N->getOpcode()) {1956default:1957return false;1958case NVPTXISD::StoreV2:1959Opcode =1960pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1961NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,1962NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,1963NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);1964break;1965case NVPTXISD::StoreV4:1966Opcode = pickOpcodeForVT(1967EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,1968NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,1969NVPTX::STV_f32_v4_ari_64, std::nullopt);1970break;1971}1972} else {1973switch (N->getOpcode()) {1974default:1975return false;1976case NVPTXISD::StoreV2:1977Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1978NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,1979NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,1980NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);1981break;1982case NVPTXISD::StoreV4:1983Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,1984NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,1985NVPTX::STV_i32_v4_ari, std::nullopt,1986NVPTX::STV_f32_v4_ari, std::nullopt);1987break;1988}1989}1990StOps.push_back(Base);1991StOps.push_back(Offset);1992} else {1993if (PointerSize == 64) {1994switch (N->getOpcode()) {1995default:1996return false;1997case NVPTXISD::StoreV2:1998Opcode = pickOpcodeForVT(1999EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,2000NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,2001NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,2002NVPTX::STV_f64_v2_areg_64);2003break;2004case NVPTXISD::StoreV4:2005Opcode = pickOpcodeForVT(2006EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,2007NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,2008NVPTX::STV_f32_v4_areg_64, std::nullopt);2009break;2010}2011} else {2012switch (N->getOpcode()) {2013default:2014return false;2015case NVPTXISD::StoreV2:2016Opcode =2017pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,2018NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,2019NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,2020NVPTX::STV_f64_v2_areg);2021break;2022case NVPTXISD::StoreV4:2023Opcode =2024pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,2025NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,2026std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);2027break;2028}2029}2030StOps.push_back(N2);2031}20322033if (!Opcode)2034return false;20352036StOps.push_back(Chain);20372038ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);20392040MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();2041CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});20422043ReplaceNode(N, ST);2044return true;2045}20462047bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {2048SDValue Chain = Node->getOperand(0);2049SDValue Offset = Node->getOperand(2);2050SDValue Glue = Node->getOperand(3);2051SDLoc DL(Node);2052MemSDNode *Mem = cast<MemSDNode>(Node);20532054unsigned VecSize;2055switch (Node->getOpcode()) {2056default:2057return false;2058case NVPTXISD::LoadParam:2059VecSize = 1;2060break;2061case NVPTXISD::LoadParamV2:2062VecSize = 2;2063break;2064case NVPTXISD::LoadParamV4:2065VecSize = 4;2066break;2067}20682069EVT EltVT = Node->getValueType(0);2070EVT MemVT = Mem->getMemoryVT();20712072std::optional<unsigned> Opcode;20732074switch (VecSize) {2075default:2076return false;2077case 1:2078Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,2079NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,2080NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,2081NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);2082break;2083case 2:2084Opcode =2085pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,2086NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,2087NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,2088NVPTX::LoadParamMemV2F64);2089break;2090case 4:2091Opcode =2092pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,2093NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,2094std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);2095break;2096}2097if (!Opcode)2098return false;20992100SDVTList VTs;2101if (VecSize == 1) {2102VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);2103} else if (VecSize == 2) {2104VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);2105} else {2106EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };2107VTs = CurDAG->getVTList(EVTs);2108}21092110unsigned OffsetVal = Offset->getAsZExtVal();21112112SmallVector<SDValue, 2> Ops;2113Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));2114Ops.push_back(Chain);2115Ops.push_back(Glue);21162117ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));2118return true;2119}21202121bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {2122SDLoc DL(N);2123SDValue Chain = N->getOperand(0);2124SDValue Offset = N->getOperand(1);2125unsigned OffsetVal = Offset->getAsZExtVal();2126MemSDNode *Mem = cast<MemSDNode>(N);21272128// How many elements do we have?2129unsigned NumElts = 1;2130switch (N->getOpcode()) {2131default:2132return false;2133case NVPTXISD::StoreRetval:2134NumElts = 1;2135break;2136case NVPTXISD::StoreRetvalV2:2137NumElts = 2;2138break;2139case NVPTXISD::StoreRetvalV4:2140NumElts = 4;2141break;2142}21432144// Build vector of operands2145SmallVector<SDValue, 6> Ops;2146for (unsigned i = 0; i < NumElts; ++i)2147Ops.push_back(N->getOperand(i + 2));2148Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));2149Ops.push_back(Chain);21502151// Determine target opcode2152// If we have an i1, use an 8-bit store. The lowering code in2153// NVPTXISelLowering will have already emitted an upcast.2154std::optional<unsigned> Opcode = 0;2155switch (NumElts) {2156default:2157return false;2158case 1:2159Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,2160NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,2161NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,2162NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);2163if (Opcode == NVPTX::StoreRetvalI8) {2164// Fine tune the opcode depending on the size of the operand.2165// This helps to avoid creating redundant COPY instructions in2166// InstrEmitter::AddRegisterOperand().2167switch (Ops[0].getSimpleValueType().SimpleTy) {2168default:2169break;2170case MVT::i32:2171Opcode = NVPTX::StoreRetvalI8TruncI32;2172break;2173case MVT::i64:2174Opcode = NVPTX::StoreRetvalI8TruncI64;2175break;2176}2177}2178break;2179case 2:2180Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,2181NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,2182NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,2183NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);2184break;2185case 4:2186Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,2187NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,2188NVPTX::StoreRetvalV4I32, std::nullopt,2189NVPTX::StoreRetvalV4F32, std::nullopt);2190break;2191}2192if (!Opcode)2193return false;21942195SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);2196MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();2197CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});21982199ReplaceNode(N, Ret);2200return true;2201}22022203// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)2204#define getOpcV2H(ty, opKind0, opKind1) \2205NVPTX::StoreParamV2##ty##_##opKind0##opKind122062207#define getOpcV2H1(ty, opKind0, isImm1) \2208(isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)22092210#define getOpcodeForVectorStParamV2(ty, isimm) \2211(isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])22122213#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \2214NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind322152216#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \2217(isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \2218: getOpcV4H(ty, opKind0, opKind1, opKind2, r)22192220#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \2221(isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \2222: getOpcV4H3(ty, opKind0, opKind1, r, isImm3)22232224#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \2225(isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \2226: getOpcV4H2(ty, opKind0, r, isImm2, isImm3)22272228#define getOpcodeForVectorStParamV4(ty, isimm) \2229(isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \2230: getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])22312232#define getOpcodeForVectorStParam(n, ty, isimm) \2233(n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \2234: getOpcodeForVectorStParamV4(ty, isimm)22352236static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,2237unsigned NumElts,2238MVT::SimpleValueType MemTy,2239SelectionDAG *CurDAG, SDLoc DL) {2240// Determine which inputs are registers and immediates make new operators2241// with constant values2242SmallVector<bool, 4> IsImm(NumElts, false);2243for (unsigned i = 0; i < NumElts; i++) {2244IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));2245if (IsImm[i]) {2246SDValue Imm = Ops[i];2247if (MemTy == MVT::f32 || MemTy == MVT::f64) {2248const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);2249const ConstantFP *CF = ConstImm->getConstantFPValue();2250Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));2251} else {2252const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);2253const ConstantInt *CI = ConstImm->getConstantIntValue();2254Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));2255}2256Ops[i] = Imm;2257}2258}22592260// Get opcode for MemTy, size, and register/immediate operand ordering2261switch (MemTy) {2262case MVT::i8:2263return getOpcodeForVectorStParam(NumElts, I8, IsImm);2264case MVT::i16:2265return getOpcodeForVectorStParam(NumElts, I16, IsImm);2266case MVT::i32:2267return getOpcodeForVectorStParam(NumElts, I32, IsImm);2268case MVT::i64:2269assert(NumElts == 2 && "MVT too large for NumElts > 2");2270return getOpcodeForVectorStParamV2(I64, IsImm);2271case MVT::f32:2272return getOpcodeForVectorStParam(NumElts, F32, IsImm);2273case MVT::f64:2274assert(NumElts == 2 && "MVT too large for NumElts > 2");2275return getOpcodeForVectorStParamV2(F64, IsImm);22762277// These cases don't support immediates, just use the all register version2278// and generate moves.2279case MVT::i1:2280return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr2281: NVPTX::StoreParamV4I8_rrrr;2282case MVT::f16:2283case MVT::bf16:2284return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr2285: NVPTX::StoreParamV4I16_rrrr;2286case MVT::v2f16:2287case MVT::v2bf16:2288case MVT::v2i16:2289case MVT::v4i8:2290return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr2291: NVPTX::StoreParamV4I32_rrrr;2292default:2293llvm_unreachable("Cannot select st.param for unknown MemTy");2294}2295}22962297bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {2298SDLoc DL(N);2299SDValue Chain = N->getOperand(0);2300SDValue Param = N->getOperand(1);2301unsigned ParamVal = Param->getAsZExtVal();2302SDValue Offset = N->getOperand(2);2303unsigned OffsetVal = Offset->getAsZExtVal();2304MemSDNode *Mem = cast<MemSDNode>(N);2305SDValue Glue = N->getOperand(N->getNumOperands() - 1);23062307// How many elements do we have?2308unsigned NumElts;2309switch (N->getOpcode()) {2310default:2311llvm_unreachable("Unexpected opcode");2312case NVPTXISD::StoreParamU32:2313case NVPTXISD::StoreParamS32:2314case NVPTXISD::StoreParam:2315NumElts = 1;2316break;2317case NVPTXISD::StoreParamV2:2318NumElts = 2;2319break;2320case NVPTXISD::StoreParamV4:2321NumElts = 4;2322break;2323}23242325// Build vector of operands2326SmallVector<SDValue, 8> Ops;2327for (unsigned i = 0; i < NumElts; ++i)2328Ops.push_back(N->getOperand(i + 3));2329Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));2330Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));2331Ops.push_back(Chain);2332Ops.push_back(Glue);23332334// Determine target opcode2335// If we have an i1, use an 8-bit store. The lowering code in2336// NVPTXISelLowering will have already emitted an upcast.2337std::optional<unsigned> Opcode;2338switch (N->getOpcode()) {2339default:2340switch (NumElts) {2341default:2342llvm_unreachable("Unexpected NumElts");2343case 1: {2344MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;2345SDValue Imm = Ops[0];2346if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&2347(isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {2348// Convert immediate to target constant2349if (MemTy == MVT::f32 || MemTy == MVT::f64) {2350const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);2351const ConstantFP *CF = ConstImm->getConstantFPValue();2352Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));2353} else {2354const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);2355const ConstantInt *CI = ConstImm->getConstantIntValue();2356Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));2357}2358Ops[0] = Imm;2359// Use immediate version of store param2360Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,2361NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,2362NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,2363NVPTX::StoreParamF64_i);2364} else2365Opcode =2366pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,2367NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,2368NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,2369NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);2370if (Opcode == NVPTX::StoreParamI8_r) {2371// Fine tune the opcode depending on the size of the operand.2372// This helps to avoid creating redundant COPY instructions in2373// InstrEmitter::AddRegisterOperand().2374switch (Ops[0].getSimpleValueType().SimpleTy) {2375default:2376break;2377case MVT::i32:2378Opcode = NVPTX::StoreParamI8TruncI32_r;2379break;2380case MVT::i64:2381Opcode = NVPTX::StoreParamI8TruncI64_r;2382break;2383}2384}2385break;2386}2387case 2:2388case 4: {2389MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;2390Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);2391break;2392}2393}2394break;2395// Special case: if we have a sign-extend/zero-extend node, insert the2396// conversion instruction first, and use that as the value operand to2397// the selected StoreParam node.2398case NVPTXISD::StoreParamU32: {2399Opcode = NVPTX::StoreParamI32_r;2400SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,2401MVT::i32);2402SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,2403MVT::i32, Ops[0], CvtNone);2404Ops[0] = SDValue(Cvt, 0);2405break;2406}2407case NVPTXISD::StoreParamS32: {2408Opcode = NVPTX::StoreParamI32_r;2409SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,2410MVT::i32);2411SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,2412MVT::i32, Ops[0], CvtNone);2413Ops[0] = SDValue(Cvt, 0);2414break;2415}2416}24172418SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);2419SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);2420MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();2421CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});24222423ReplaceNode(N, Ret);2424return true;2425}24262427bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {2428unsigned Opc = 0;24292430switch (N->getOpcode()) {2431default: return false;2432case NVPTXISD::Tex1DFloatS32:2433Opc = NVPTX::TEX_1D_F32_S32_RR;2434break;2435case NVPTXISD::Tex1DFloatFloat:2436Opc = NVPTX::TEX_1D_F32_F32_RR;2437break;2438case NVPTXISD::Tex1DFloatFloatLevel:2439Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;2440break;2441case NVPTXISD::Tex1DFloatFloatGrad:2442Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;2443break;2444case NVPTXISD::Tex1DS32S32:2445Opc = NVPTX::TEX_1D_S32_S32_RR;2446break;2447case NVPTXISD::Tex1DS32Float:2448Opc = NVPTX::TEX_1D_S32_F32_RR;2449break;2450case NVPTXISD::Tex1DS32FloatLevel:2451Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;2452break;2453case NVPTXISD::Tex1DS32FloatGrad:2454Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;2455break;2456case NVPTXISD::Tex1DU32S32:2457Opc = NVPTX::TEX_1D_U32_S32_RR;2458break;2459case NVPTXISD::Tex1DU32Float:2460Opc = NVPTX::TEX_1D_U32_F32_RR;2461break;2462case NVPTXISD::Tex1DU32FloatLevel:2463Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;2464break;2465case NVPTXISD::Tex1DU32FloatGrad:2466Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;2467break;2468case NVPTXISD::Tex1DArrayFloatS32:2469Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;2470break;2471case NVPTXISD::Tex1DArrayFloatFloat:2472Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;2473break;2474case NVPTXISD::Tex1DArrayFloatFloatLevel:2475Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;2476break;2477case NVPTXISD::Tex1DArrayFloatFloatGrad:2478Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;2479break;2480case NVPTXISD::Tex1DArrayS32S32:2481Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;2482break;2483case NVPTXISD::Tex1DArrayS32Float:2484Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;2485break;2486case NVPTXISD::Tex1DArrayS32FloatLevel:2487Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;2488break;2489case NVPTXISD::Tex1DArrayS32FloatGrad:2490Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;2491break;2492case NVPTXISD::Tex1DArrayU32S32:2493Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;2494break;2495case NVPTXISD::Tex1DArrayU32Float:2496Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;2497break;2498case NVPTXISD::Tex1DArrayU32FloatLevel:2499Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;2500break;2501case NVPTXISD::Tex1DArrayU32FloatGrad:2502Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;2503break;2504case NVPTXISD::Tex2DFloatS32:2505Opc = NVPTX::TEX_2D_F32_S32_RR;2506break;2507case NVPTXISD::Tex2DFloatFloat:2508Opc = NVPTX::TEX_2D_F32_F32_RR;2509break;2510case NVPTXISD::Tex2DFloatFloatLevel:2511Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;2512break;2513case NVPTXISD::Tex2DFloatFloatGrad:2514Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;2515break;2516case NVPTXISD::Tex2DS32S32:2517Opc = NVPTX::TEX_2D_S32_S32_RR;2518break;2519case NVPTXISD::Tex2DS32Float:2520Opc = NVPTX::TEX_2D_S32_F32_RR;2521break;2522case NVPTXISD::Tex2DS32FloatLevel:2523Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;2524break;2525case NVPTXISD::Tex2DS32FloatGrad:2526Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;2527break;2528case NVPTXISD::Tex2DU32S32:2529Opc = NVPTX::TEX_2D_U32_S32_RR;2530break;2531case NVPTXISD::Tex2DU32Float:2532Opc = NVPTX::TEX_2D_U32_F32_RR;2533break;2534case NVPTXISD::Tex2DU32FloatLevel:2535Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;2536break;2537case NVPTXISD::Tex2DU32FloatGrad:2538Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;2539break;2540case NVPTXISD::Tex2DArrayFloatS32:2541Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;2542break;2543case NVPTXISD::Tex2DArrayFloatFloat:2544Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;2545break;2546case NVPTXISD::Tex2DArrayFloatFloatLevel:2547Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;2548break;2549case NVPTXISD::Tex2DArrayFloatFloatGrad:2550Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;2551break;2552case NVPTXISD::Tex2DArrayS32S32:2553Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;2554break;2555case NVPTXISD::Tex2DArrayS32Float:2556Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;2557break;2558case NVPTXISD::Tex2DArrayS32FloatLevel:2559Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;2560break;2561case NVPTXISD::Tex2DArrayS32FloatGrad:2562Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;2563break;2564case NVPTXISD::Tex2DArrayU32S32:2565Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;2566break;2567case NVPTXISD::Tex2DArrayU32Float:2568Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;2569break;2570case NVPTXISD::Tex2DArrayU32FloatLevel:2571Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;2572break;2573case NVPTXISD::Tex2DArrayU32FloatGrad:2574Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;2575break;2576case NVPTXISD::Tex3DFloatS32:2577Opc = NVPTX::TEX_3D_F32_S32_RR;2578break;2579case NVPTXISD::Tex3DFloatFloat:2580Opc = NVPTX::TEX_3D_F32_F32_RR;2581break;2582case NVPTXISD::Tex3DFloatFloatLevel:2583Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;2584break;2585case NVPTXISD::Tex3DFloatFloatGrad:2586Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;2587break;2588case NVPTXISD::Tex3DS32S32:2589Opc = NVPTX::TEX_3D_S32_S32_RR;2590break;2591case NVPTXISD::Tex3DS32Float:2592Opc = NVPTX::TEX_3D_S32_F32_RR;2593break;2594case NVPTXISD::Tex3DS32FloatLevel:2595Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;2596break;2597case NVPTXISD::Tex3DS32FloatGrad:2598Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;2599break;2600case NVPTXISD::Tex3DU32S32:2601Opc = NVPTX::TEX_3D_U32_S32_RR;2602break;2603case NVPTXISD::Tex3DU32Float:2604Opc = NVPTX::TEX_3D_U32_F32_RR;2605break;2606case NVPTXISD::Tex3DU32FloatLevel:2607Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;2608break;2609case NVPTXISD::Tex3DU32FloatGrad:2610Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;2611break;2612case NVPTXISD::TexCubeFloatFloat:2613Opc = NVPTX::TEX_CUBE_F32_F32_RR;2614break;2615case NVPTXISD::TexCubeFloatFloatLevel:2616Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;2617break;2618case NVPTXISD::TexCubeS32Float:2619Opc = NVPTX::TEX_CUBE_S32_F32_RR;2620break;2621case NVPTXISD::TexCubeS32FloatLevel:2622Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;2623break;2624case NVPTXISD::TexCubeU32Float:2625Opc = NVPTX::TEX_CUBE_U32_F32_RR;2626break;2627case NVPTXISD::TexCubeU32FloatLevel:2628Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;2629break;2630case NVPTXISD::TexCubeArrayFloatFloat:2631Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;2632break;2633case NVPTXISD::TexCubeArrayFloatFloatLevel:2634Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;2635break;2636case NVPTXISD::TexCubeArrayS32Float:2637Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;2638break;2639case NVPTXISD::TexCubeArrayS32FloatLevel:2640Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;2641break;2642case NVPTXISD::TexCubeArrayU32Float:2643Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;2644break;2645case NVPTXISD::TexCubeArrayU32FloatLevel:2646Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;2647break;2648case NVPTXISD::Tld4R2DFloatFloat:2649Opc = NVPTX::TLD4_R_2D_F32_F32_RR;2650break;2651case NVPTXISD::Tld4G2DFloatFloat:2652Opc = NVPTX::TLD4_G_2D_F32_F32_RR;2653break;2654case NVPTXISD::Tld4B2DFloatFloat:2655Opc = NVPTX::TLD4_B_2D_F32_F32_RR;2656break;2657case NVPTXISD::Tld4A2DFloatFloat:2658Opc = NVPTX::TLD4_A_2D_F32_F32_RR;2659break;2660case NVPTXISD::Tld4R2DS64Float:2661Opc = NVPTX::TLD4_R_2D_S32_F32_RR;2662break;2663case NVPTXISD::Tld4G2DS64Float:2664Opc = NVPTX::TLD4_G_2D_S32_F32_RR;2665break;2666case NVPTXISD::Tld4B2DS64Float:2667Opc = NVPTX::TLD4_B_2D_S32_F32_RR;2668break;2669case NVPTXISD::Tld4A2DS64Float:2670Opc = NVPTX::TLD4_A_2D_S32_F32_RR;2671break;2672case NVPTXISD::Tld4R2DU64Float:2673Opc = NVPTX::TLD4_R_2D_U32_F32_RR;2674break;2675case NVPTXISD::Tld4G2DU64Float:2676Opc = NVPTX::TLD4_G_2D_U32_F32_RR;2677break;2678case NVPTXISD::Tld4B2DU64Float:2679Opc = NVPTX::TLD4_B_2D_U32_F32_RR;2680break;2681case NVPTXISD::Tld4A2DU64Float:2682Opc = NVPTX::TLD4_A_2D_U32_F32_RR;2683break;2684case NVPTXISD::TexUnified1DFloatS32:2685Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;2686break;2687case NVPTXISD::TexUnified1DFloatFloat:2688Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;2689break;2690case NVPTXISD::TexUnified1DFloatFloatLevel:2691Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;2692break;2693case NVPTXISD::TexUnified1DFloatFloatGrad:2694Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;2695break;2696case NVPTXISD::TexUnified1DS32S32:2697Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;2698break;2699case NVPTXISD::TexUnified1DS32Float:2700Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;2701break;2702case NVPTXISD::TexUnified1DS32FloatLevel:2703Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;2704break;2705case NVPTXISD::TexUnified1DS32FloatGrad:2706Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;2707break;2708case NVPTXISD::TexUnified1DU32S32:2709Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;2710break;2711case NVPTXISD::TexUnified1DU32Float:2712Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;2713break;2714case NVPTXISD::TexUnified1DU32FloatLevel:2715Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;2716break;2717case NVPTXISD::TexUnified1DU32FloatGrad:2718Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;2719break;2720case NVPTXISD::TexUnified1DArrayFloatS32:2721Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;2722break;2723case NVPTXISD::TexUnified1DArrayFloatFloat:2724Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;2725break;2726case NVPTXISD::TexUnified1DArrayFloatFloatLevel:2727Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;2728break;2729case NVPTXISD::TexUnified1DArrayFloatFloatGrad:2730Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;2731break;2732case NVPTXISD::TexUnified1DArrayS32S32:2733Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;2734break;2735case NVPTXISD::TexUnified1DArrayS32Float:2736Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;2737break;2738case NVPTXISD::TexUnified1DArrayS32FloatLevel:2739Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;2740break;2741case NVPTXISD::TexUnified1DArrayS32FloatGrad:2742Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;2743break;2744case NVPTXISD::TexUnified1DArrayU32S32:2745Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;2746break;2747case NVPTXISD::TexUnified1DArrayU32Float:2748Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;2749break;2750case NVPTXISD::TexUnified1DArrayU32FloatLevel:2751Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;2752break;2753case NVPTXISD::TexUnified1DArrayU32FloatGrad:2754Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;2755break;2756case NVPTXISD::TexUnified2DFloatS32:2757Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;2758break;2759case NVPTXISD::TexUnified2DFloatFloat:2760Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;2761break;2762case NVPTXISD::TexUnified2DFloatFloatLevel:2763Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;2764break;2765case NVPTXISD::TexUnified2DFloatFloatGrad:2766Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;2767break;2768case NVPTXISD::TexUnified2DS32S32:2769Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;2770break;2771case NVPTXISD::TexUnified2DS32Float:2772Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;2773break;2774case NVPTXISD::TexUnified2DS32FloatLevel:2775Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;2776break;2777case NVPTXISD::TexUnified2DS32FloatGrad:2778Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;2779break;2780case NVPTXISD::TexUnified2DU32S32:2781Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;2782break;2783case NVPTXISD::TexUnified2DU32Float:2784Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;2785break;2786case NVPTXISD::TexUnified2DU32FloatLevel:2787Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;2788break;2789case NVPTXISD::TexUnified2DU32FloatGrad:2790Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;2791break;2792case NVPTXISD::TexUnified2DArrayFloatS32:2793Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;2794break;2795case NVPTXISD::TexUnified2DArrayFloatFloat:2796Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;2797break;2798case NVPTXISD::TexUnified2DArrayFloatFloatLevel:2799Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;2800break;2801case NVPTXISD::TexUnified2DArrayFloatFloatGrad:2802Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;2803break;2804case NVPTXISD::TexUnified2DArrayS32S32:2805Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;2806break;2807case NVPTXISD::TexUnified2DArrayS32Float:2808Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;2809break;2810case NVPTXISD::TexUnified2DArrayS32FloatLevel:2811Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;2812break;2813case NVPTXISD::TexUnified2DArrayS32FloatGrad:2814Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;2815break;2816case NVPTXISD::TexUnified2DArrayU32S32:2817Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;2818break;2819case NVPTXISD::TexUnified2DArrayU32Float:2820Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;2821break;2822case NVPTXISD::TexUnified2DArrayU32FloatLevel:2823Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;2824break;2825case NVPTXISD::TexUnified2DArrayU32FloatGrad:2826Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;2827break;2828case NVPTXISD::TexUnified3DFloatS32:2829Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;2830break;2831case NVPTXISD::TexUnified3DFloatFloat:2832Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;2833break;2834case NVPTXISD::TexUnified3DFloatFloatLevel:2835Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;2836break;2837case NVPTXISD::TexUnified3DFloatFloatGrad:2838Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;2839break;2840case NVPTXISD::TexUnified3DS32S32:2841Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;2842break;2843case NVPTXISD::TexUnified3DS32Float:2844Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;2845break;2846case NVPTXISD::TexUnified3DS32FloatLevel:2847Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;2848break;2849case NVPTXISD::TexUnified3DS32FloatGrad:2850Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;2851break;2852case NVPTXISD::TexUnified3DU32S32:2853Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;2854break;2855case NVPTXISD::TexUnified3DU32Float:2856Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;2857break;2858case NVPTXISD::TexUnified3DU32FloatLevel:2859Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;2860break;2861case NVPTXISD::TexUnified3DU32FloatGrad:2862Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;2863break;2864case NVPTXISD::TexUnifiedCubeFloatFloat:2865Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;2866break;2867case NVPTXISD::TexUnifiedCubeFloatFloatLevel:2868Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;2869break;2870case NVPTXISD::TexUnifiedCubeS32Float:2871Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;2872break;2873case NVPTXISD::TexUnifiedCubeS32FloatLevel:2874Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;2875break;2876case NVPTXISD::TexUnifiedCubeU32Float:2877Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;2878break;2879case NVPTXISD::TexUnifiedCubeU32FloatLevel:2880Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;2881break;2882case NVPTXISD::TexUnifiedCubeArrayFloatFloat:2883Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;2884break;2885case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:2886Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;2887break;2888case NVPTXISD::TexUnifiedCubeArrayS32Float:2889Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;2890break;2891case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:2892Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;2893break;2894case NVPTXISD::TexUnifiedCubeArrayU32Float:2895Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;2896break;2897case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:2898Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;2899break;2900case NVPTXISD::Tld4UnifiedR2DFloatFloat:2901Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;2902break;2903case NVPTXISD::Tld4UnifiedG2DFloatFloat:2904Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;2905break;2906case NVPTXISD::Tld4UnifiedB2DFloatFloat:2907Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;2908break;2909case NVPTXISD::Tld4UnifiedA2DFloatFloat:2910Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;2911break;2912case NVPTXISD::Tld4UnifiedR2DS64Float:2913Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;2914break;2915case NVPTXISD::Tld4UnifiedG2DS64Float:2916Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;2917break;2918case NVPTXISD::Tld4UnifiedB2DS64Float:2919Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;2920break;2921case NVPTXISD::Tld4UnifiedA2DS64Float:2922Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;2923break;2924case NVPTXISD::Tld4UnifiedR2DU64Float:2925Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;2926break;2927case NVPTXISD::Tld4UnifiedG2DU64Float:2928Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;2929break;2930case NVPTXISD::Tld4UnifiedB2DU64Float:2931Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;2932break;2933case NVPTXISD::Tld4UnifiedA2DU64Float:2934Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;2935break;2936case NVPTXISD::TexUnifiedCubeFloatFloatGrad:2937Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R;2938break;2939case NVPTXISD::TexUnifiedCubeS32FloatGrad:2940Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R;2941break;2942case NVPTXISD::TexUnifiedCubeU32FloatGrad:2943Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R;2944break;2945case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:2946Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R;2947break;2948case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:2949Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R;2950break;2951case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:2952Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R;2953break;2954}29552956// Copy over operands2957SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));2958Ops.push_back(N->getOperand(0)); // Move chain to the back.29592960ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));2961return true;2962}29632964bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {2965unsigned Opc = 0;2966switch (N->getOpcode()) {2967default: return false;2968case NVPTXISD::Suld1DI8Clamp:2969Opc = NVPTX::SULD_1D_I8_CLAMP_R;2970break;2971case NVPTXISD::Suld1DI16Clamp:2972Opc = NVPTX::SULD_1D_I16_CLAMP_R;2973break;2974case NVPTXISD::Suld1DI32Clamp:2975Opc = NVPTX::SULD_1D_I32_CLAMP_R;2976break;2977case NVPTXISD::Suld1DI64Clamp:2978Opc = NVPTX::SULD_1D_I64_CLAMP_R;2979break;2980case NVPTXISD::Suld1DV2I8Clamp:2981Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;2982break;2983case NVPTXISD::Suld1DV2I16Clamp:2984Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;2985break;2986case NVPTXISD::Suld1DV2I32Clamp:2987Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;2988break;2989case NVPTXISD::Suld1DV2I64Clamp:2990Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;2991break;2992case NVPTXISD::Suld1DV4I8Clamp:2993Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;2994break;2995case NVPTXISD::Suld1DV4I16Clamp:2996Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;2997break;2998case NVPTXISD::Suld1DV4I32Clamp:2999Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;3000break;3001case NVPTXISD::Suld1DArrayI8Clamp:3002Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;3003break;3004case NVPTXISD::Suld1DArrayI16Clamp:3005Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;3006break;3007case NVPTXISD::Suld1DArrayI32Clamp:3008Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;3009break;3010case NVPTXISD::Suld1DArrayI64Clamp:3011Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;3012break;3013case NVPTXISD::Suld1DArrayV2I8Clamp:3014Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;3015break;3016case NVPTXISD::Suld1DArrayV2I16Clamp:3017Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;3018break;3019case NVPTXISD::Suld1DArrayV2I32Clamp:3020Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;3021break;3022case NVPTXISD::Suld1DArrayV2I64Clamp:3023Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;3024break;3025case NVPTXISD::Suld1DArrayV4I8Clamp:3026Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;3027break;3028case NVPTXISD::Suld1DArrayV4I16Clamp:3029Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;3030break;3031case NVPTXISD::Suld1DArrayV4I32Clamp:3032Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;3033break;3034case NVPTXISD::Suld2DI8Clamp:3035Opc = NVPTX::SULD_2D_I8_CLAMP_R;3036break;3037case NVPTXISD::Suld2DI16Clamp:3038Opc = NVPTX::SULD_2D_I16_CLAMP_R;3039break;3040case NVPTXISD::Suld2DI32Clamp:3041Opc = NVPTX::SULD_2D_I32_CLAMP_R;3042break;3043case NVPTXISD::Suld2DI64Clamp:3044Opc = NVPTX::SULD_2D_I64_CLAMP_R;3045break;3046case NVPTXISD::Suld2DV2I8Clamp:3047Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;3048break;3049case NVPTXISD::Suld2DV2I16Clamp:3050Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;3051break;3052case NVPTXISD::Suld2DV2I32Clamp:3053Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;3054break;3055case NVPTXISD::Suld2DV2I64Clamp:3056Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;3057break;3058case NVPTXISD::Suld2DV4I8Clamp:3059Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;3060break;3061case NVPTXISD::Suld2DV4I16Clamp:3062Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;3063break;3064case NVPTXISD::Suld2DV4I32Clamp:3065Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;3066break;3067case NVPTXISD::Suld2DArrayI8Clamp:3068Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;3069break;3070case NVPTXISD::Suld2DArrayI16Clamp:3071Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;3072break;3073case NVPTXISD::Suld2DArrayI32Clamp:3074Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;3075break;3076case NVPTXISD::Suld2DArrayI64Clamp:3077Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;3078break;3079case NVPTXISD::Suld2DArrayV2I8Clamp:3080Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;3081break;3082case NVPTXISD::Suld2DArrayV2I16Clamp:3083Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;3084break;3085case NVPTXISD::Suld2DArrayV2I32Clamp:3086Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;3087break;3088case NVPTXISD::Suld2DArrayV2I64Clamp:3089Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;3090break;3091case NVPTXISD::Suld2DArrayV4I8Clamp:3092Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;3093break;3094case NVPTXISD::Suld2DArrayV4I16Clamp:3095Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;3096break;3097case NVPTXISD::Suld2DArrayV4I32Clamp:3098Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;3099break;3100case NVPTXISD::Suld3DI8Clamp:3101Opc = NVPTX::SULD_3D_I8_CLAMP_R;3102break;3103case NVPTXISD::Suld3DI16Clamp:3104Opc = NVPTX::SULD_3D_I16_CLAMP_R;3105break;3106case NVPTXISD::Suld3DI32Clamp:3107Opc = NVPTX::SULD_3D_I32_CLAMP_R;3108break;3109case NVPTXISD::Suld3DI64Clamp:3110Opc = NVPTX::SULD_3D_I64_CLAMP_R;3111break;3112case NVPTXISD::Suld3DV2I8Clamp:3113Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;3114break;3115case NVPTXISD::Suld3DV2I16Clamp:3116Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;3117break;3118case NVPTXISD::Suld3DV2I32Clamp:3119Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;3120break;3121case NVPTXISD::Suld3DV2I64Clamp:3122Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;3123break;3124case NVPTXISD::Suld3DV4I8Clamp:3125Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;3126break;3127case NVPTXISD::Suld3DV4I16Clamp:3128Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;3129break;3130case NVPTXISD::Suld3DV4I32Clamp:3131Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;3132break;3133case NVPTXISD::Suld1DI8Trap:3134Opc = NVPTX::SULD_1D_I8_TRAP_R;3135break;3136case NVPTXISD::Suld1DI16Trap:3137Opc = NVPTX::SULD_1D_I16_TRAP_R;3138break;3139case NVPTXISD::Suld1DI32Trap:3140Opc = NVPTX::SULD_1D_I32_TRAP_R;3141break;3142case NVPTXISD::Suld1DI64Trap:3143Opc = NVPTX::SULD_1D_I64_TRAP_R;3144break;3145case NVPTXISD::Suld1DV2I8Trap:3146Opc = NVPTX::SULD_1D_V2I8_TRAP_R;3147break;3148case NVPTXISD::Suld1DV2I16Trap:3149Opc = NVPTX::SULD_1D_V2I16_TRAP_R;3150break;3151case NVPTXISD::Suld1DV2I32Trap:3152Opc = NVPTX::SULD_1D_V2I32_TRAP_R;3153break;3154case NVPTXISD::Suld1DV2I64Trap:3155Opc = NVPTX::SULD_1D_V2I64_TRAP_R;3156break;3157case NVPTXISD::Suld1DV4I8Trap:3158Opc = NVPTX::SULD_1D_V4I8_TRAP_R;3159break;3160case NVPTXISD::Suld1DV4I16Trap:3161Opc = NVPTX::SULD_1D_V4I16_TRAP_R;3162break;3163case NVPTXISD::Suld1DV4I32Trap:3164Opc = NVPTX::SULD_1D_V4I32_TRAP_R;3165break;3166case NVPTXISD::Suld1DArrayI8Trap:3167Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;3168break;3169case NVPTXISD::Suld1DArrayI16Trap:3170Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;3171break;3172case NVPTXISD::Suld1DArrayI32Trap:3173Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;3174break;3175case NVPTXISD::Suld1DArrayI64Trap:3176Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;3177break;3178case NVPTXISD::Suld1DArrayV2I8Trap:3179Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;3180break;3181case NVPTXISD::Suld1DArrayV2I16Trap:3182Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;3183break;3184case NVPTXISD::Suld1DArrayV2I32Trap:3185Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;3186break;3187case NVPTXISD::Suld1DArrayV2I64Trap:3188Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;3189break;3190case NVPTXISD::Suld1DArrayV4I8Trap:3191Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;3192break;3193case NVPTXISD::Suld1DArrayV4I16Trap:3194Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;3195break;3196case NVPTXISD::Suld1DArrayV4I32Trap:3197Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;3198break;3199case NVPTXISD::Suld2DI8Trap:3200Opc = NVPTX::SULD_2D_I8_TRAP_R;3201break;3202case NVPTXISD::Suld2DI16Trap:3203Opc = NVPTX::SULD_2D_I16_TRAP_R;3204break;3205case NVPTXISD::Suld2DI32Trap:3206Opc = NVPTX::SULD_2D_I32_TRAP_R;3207break;3208case NVPTXISD::Suld2DI64Trap:3209Opc = NVPTX::SULD_2D_I64_TRAP_R;3210break;3211case NVPTXISD::Suld2DV2I8Trap:3212Opc = NVPTX::SULD_2D_V2I8_TRAP_R;3213break;3214case NVPTXISD::Suld2DV2I16Trap:3215Opc = NVPTX::SULD_2D_V2I16_TRAP_R;3216break;3217case NVPTXISD::Suld2DV2I32Trap:3218Opc = NVPTX::SULD_2D_V2I32_TRAP_R;3219break;3220case NVPTXISD::Suld2DV2I64Trap:3221Opc = NVPTX::SULD_2D_V2I64_TRAP_R;3222break;3223case NVPTXISD::Suld2DV4I8Trap:3224Opc = NVPTX::SULD_2D_V4I8_TRAP_R;3225break;3226case NVPTXISD::Suld2DV4I16Trap:3227Opc = NVPTX::SULD_2D_V4I16_TRAP_R;3228break;3229case NVPTXISD::Suld2DV4I32Trap:3230Opc = NVPTX::SULD_2D_V4I32_TRAP_R;3231break;3232case NVPTXISD::Suld2DArrayI8Trap:3233Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;3234break;3235case NVPTXISD::Suld2DArrayI16Trap:3236Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;3237break;3238case NVPTXISD::Suld2DArrayI32Trap:3239Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;3240break;3241case NVPTXISD::Suld2DArrayI64Trap:3242Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;3243break;3244case NVPTXISD::Suld2DArrayV2I8Trap:3245Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;3246break;3247case NVPTXISD::Suld2DArrayV2I16Trap:3248Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;3249break;3250case NVPTXISD::Suld2DArrayV2I32Trap:3251Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;3252break;3253case NVPTXISD::Suld2DArrayV2I64Trap:3254Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;3255break;3256case NVPTXISD::Suld2DArrayV4I8Trap:3257Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;3258break;3259case NVPTXISD::Suld2DArrayV4I16Trap:3260Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;3261break;3262case NVPTXISD::Suld2DArrayV4I32Trap:3263Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;3264break;3265case NVPTXISD::Suld3DI8Trap:3266Opc = NVPTX::SULD_3D_I8_TRAP_R;3267break;3268case NVPTXISD::Suld3DI16Trap:3269Opc = NVPTX::SULD_3D_I16_TRAP_R;3270break;3271case NVPTXISD::Suld3DI32Trap:3272Opc = NVPTX::SULD_3D_I32_TRAP_R;3273break;3274case NVPTXISD::Suld3DI64Trap:3275Opc = NVPTX::SULD_3D_I64_TRAP_R;3276break;3277case NVPTXISD::Suld3DV2I8Trap:3278Opc = NVPTX::SULD_3D_V2I8_TRAP_R;3279break;3280case NVPTXISD::Suld3DV2I16Trap:3281Opc = NVPTX::SULD_3D_V2I16_TRAP_R;3282break;3283case NVPTXISD::Suld3DV2I32Trap:3284Opc = NVPTX::SULD_3D_V2I32_TRAP_R;3285break;3286case NVPTXISD::Suld3DV2I64Trap:3287Opc = NVPTX::SULD_3D_V2I64_TRAP_R;3288break;3289case NVPTXISD::Suld3DV4I8Trap:3290Opc = NVPTX::SULD_3D_V4I8_TRAP_R;3291break;3292case NVPTXISD::Suld3DV4I16Trap:3293Opc = NVPTX::SULD_3D_V4I16_TRAP_R;3294break;3295case NVPTXISD::Suld3DV4I32Trap:3296Opc = NVPTX::SULD_3D_V4I32_TRAP_R;3297break;3298case NVPTXISD::Suld1DI8Zero:3299Opc = NVPTX::SULD_1D_I8_ZERO_R;3300break;3301case NVPTXISD::Suld1DI16Zero:3302Opc = NVPTX::SULD_1D_I16_ZERO_R;3303break;3304case NVPTXISD::Suld1DI32Zero:3305Opc = NVPTX::SULD_1D_I32_ZERO_R;3306break;3307case NVPTXISD::Suld1DI64Zero:3308Opc = NVPTX::SULD_1D_I64_ZERO_R;3309break;3310case NVPTXISD::Suld1DV2I8Zero:3311Opc = NVPTX::SULD_1D_V2I8_ZERO_R;3312break;3313case NVPTXISD::Suld1DV2I16Zero:3314Opc = NVPTX::SULD_1D_V2I16_ZERO_R;3315break;3316case NVPTXISD::Suld1DV2I32Zero:3317Opc = NVPTX::SULD_1D_V2I32_ZERO_R;3318break;3319case NVPTXISD::Suld1DV2I64Zero:3320Opc = NVPTX::SULD_1D_V2I64_ZERO_R;3321break;3322case NVPTXISD::Suld1DV4I8Zero:3323Opc = NVPTX::SULD_1D_V4I8_ZERO_R;3324break;3325case NVPTXISD::Suld1DV4I16Zero:3326Opc = NVPTX::SULD_1D_V4I16_ZERO_R;3327break;3328case NVPTXISD::Suld1DV4I32Zero:3329Opc = NVPTX::SULD_1D_V4I32_ZERO_R;3330break;3331case NVPTXISD::Suld1DArrayI8Zero:3332Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;3333break;3334case NVPTXISD::Suld1DArrayI16Zero:3335Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;3336break;3337case NVPTXISD::Suld1DArrayI32Zero:3338Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;3339break;3340case NVPTXISD::Suld1DArrayI64Zero:3341Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;3342break;3343case NVPTXISD::Suld1DArrayV2I8Zero:3344Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;3345break;3346case NVPTXISD::Suld1DArrayV2I16Zero:3347Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;3348break;3349case NVPTXISD::Suld1DArrayV2I32Zero:3350Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;3351break;3352case NVPTXISD::Suld1DArrayV2I64Zero:3353Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;3354break;3355case NVPTXISD::Suld1DArrayV4I8Zero:3356Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;3357break;3358case NVPTXISD::Suld1DArrayV4I16Zero:3359Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;3360break;3361case NVPTXISD::Suld1DArrayV4I32Zero:3362Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;3363break;3364case NVPTXISD::Suld2DI8Zero:3365Opc = NVPTX::SULD_2D_I8_ZERO_R;3366break;3367case NVPTXISD::Suld2DI16Zero:3368Opc = NVPTX::SULD_2D_I16_ZERO_R;3369break;3370case NVPTXISD::Suld2DI32Zero:3371Opc = NVPTX::SULD_2D_I32_ZERO_R;3372break;3373case NVPTXISD::Suld2DI64Zero:3374Opc = NVPTX::SULD_2D_I64_ZERO_R;3375break;3376case NVPTXISD::Suld2DV2I8Zero:3377Opc = NVPTX::SULD_2D_V2I8_ZERO_R;3378break;3379case NVPTXISD::Suld2DV2I16Zero:3380Opc = NVPTX::SULD_2D_V2I16_ZERO_R;3381break;3382case NVPTXISD::Suld2DV2I32Zero:3383Opc = NVPTX::SULD_2D_V2I32_ZERO_R;3384break;3385case NVPTXISD::Suld2DV2I64Zero:3386Opc = NVPTX::SULD_2D_V2I64_ZERO_R;3387break;3388case NVPTXISD::Suld2DV4I8Zero:3389Opc = NVPTX::SULD_2D_V4I8_ZERO_R;3390break;3391case NVPTXISD::Suld2DV4I16Zero:3392Opc = NVPTX::SULD_2D_V4I16_ZERO_R;3393break;3394case NVPTXISD::Suld2DV4I32Zero:3395Opc = NVPTX::SULD_2D_V4I32_ZERO_R;3396break;3397case NVPTXISD::Suld2DArrayI8Zero:3398Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;3399break;3400case NVPTXISD::Suld2DArrayI16Zero:3401Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;3402break;3403case NVPTXISD::Suld2DArrayI32Zero:3404Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;3405break;3406case NVPTXISD::Suld2DArrayI64Zero:3407Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;3408break;3409case NVPTXISD::Suld2DArrayV2I8Zero:3410Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;3411break;3412case NVPTXISD::Suld2DArrayV2I16Zero:3413Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;3414break;3415case NVPTXISD::Suld2DArrayV2I32Zero:3416Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;3417break;3418case NVPTXISD::Suld2DArrayV2I64Zero:3419Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;3420break;3421case NVPTXISD::Suld2DArrayV4I8Zero:3422Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;3423break;3424case NVPTXISD::Suld2DArrayV4I16Zero:3425Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;3426break;3427case NVPTXISD::Suld2DArrayV4I32Zero:3428Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;3429break;3430case NVPTXISD::Suld3DI8Zero:3431Opc = NVPTX::SULD_3D_I8_ZERO_R;3432break;3433case NVPTXISD::Suld3DI16Zero:3434Opc = NVPTX::SULD_3D_I16_ZERO_R;3435break;3436case NVPTXISD::Suld3DI32Zero:3437Opc = NVPTX::SULD_3D_I32_ZERO_R;3438break;3439case NVPTXISD::Suld3DI64Zero:3440Opc = NVPTX::SULD_3D_I64_ZERO_R;3441break;3442case NVPTXISD::Suld3DV2I8Zero:3443Opc = NVPTX::SULD_3D_V2I8_ZERO_R;3444break;3445case NVPTXISD::Suld3DV2I16Zero:3446Opc = NVPTX::SULD_3D_V2I16_ZERO_R;3447break;3448case NVPTXISD::Suld3DV2I32Zero:3449Opc = NVPTX::SULD_3D_V2I32_ZERO_R;3450break;3451case NVPTXISD::Suld3DV2I64Zero:3452Opc = NVPTX::SULD_3D_V2I64_ZERO_R;3453break;3454case NVPTXISD::Suld3DV4I8Zero:3455Opc = NVPTX::SULD_3D_V4I8_ZERO_R;3456break;3457case NVPTXISD::Suld3DV4I16Zero:3458Opc = NVPTX::SULD_3D_V4I16_ZERO_R;3459break;3460case NVPTXISD::Suld3DV4I32Zero:3461Opc = NVPTX::SULD_3D_V4I32_ZERO_R;3462break;3463}34643465// Copy over operands3466SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));3467Ops.push_back(N->getOperand(0)); // Move chain to the back.34683469ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));3470return true;3471}347234733474/// SelectBFE - Look for instruction sequences that can be made more efficient3475/// by using the 'bfe' (bit-field extract) PTX instruction3476bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {3477SDLoc DL(N);3478SDValue LHS = N->getOperand(0);3479SDValue RHS = N->getOperand(1);3480SDValue Len;3481SDValue Start;3482SDValue Val;3483bool IsSigned = false;34843485if (N->getOpcode() == ISD::AND) {3486// Canonicalize the operands3487// We want 'and %val, %mask'3488if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {3489std::swap(LHS, RHS);3490}34913492ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);3493if (!Mask) {3494// We need a constant mask on the RHS of the AND3495return false;3496}34973498// Extract the mask bits3499uint64_t MaskVal = Mask->getZExtValue();3500if (!isMask_64(MaskVal)) {3501// We *could* handle shifted masks here, but doing so would require an3502// 'and' operation to fix up the low-order bits so we would trade3503// shr+and for bfe+and, which has the same throughput3504return false;3505}35063507// How many bits are in our mask?3508int64_t NumBits = countr_one(MaskVal);3509Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);35103511if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {3512// We have a 'srl/and' pair, extract the effective start bit and length3513Val = LHS.getNode()->getOperand(0);3514Start = LHS.getNode()->getOperand(1);3515ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);3516if (StartConst) {3517uint64_t StartVal = StartConst->getZExtValue();3518// How many "good" bits do we have left? "good" is defined here as bits3519// that exist in the original value, not shifted in.3520int64_t GoodBits = Start.getValueSizeInBits() - StartVal;3521if (NumBits > GoodBits) {3522// Do not handle the case where bits have been shifted in. In theory3523// we could handle this, but the cost is likely higher than just3524// emitting the srl/and pair.3525return false;3526}3527Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);3528} else {3529// Do not handle the case where the shift amount (can be zero if no srl3530// was found) is not constant. We could handle this case, but it would3531// require run-time logic that would be more expensive than just3532// emitting the srl/and pair.3533return false;3534}3535} else {3536// Do not handle the case where the LHS of the and is not a shift. While3537// it would be trivial to handle this case, it would just transform3538// 'and' -> 'bfe', but 'and' has higher-throughput.3539return false;3540}3541} else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {3542if (LHS->getOpcode() == ISD::AND) {3543ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);3544if (!ShiftCnst) {3545// Shift amount must be constant3546return false;3547}35483549uint64_t ShiftAmt = ShiftCnst->getZExtValue();35503551SDValue AndLHS = LHS->getOperand(0);3552SDValue AndRHS = LHS->getOperand(1);35533554// Canonicalize the AND to have the mask on the RHS3555if (isa<ConstantSDNode>(AndLHS)) {3556std::swap(AndLHS, AndRHS);3557}35583559ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);3560if (!MaskCnst) {3561// Mask must be constant3562return false;3563}35643565uint64_t MaskVal = MaskCnst->getZExtValue();3566uint64_t NumZeros;3567uint64_t NumBits;3568if (isMask_64(MaskVal)) {3569NumZeros = 0;3570// The number of bits in the result bitfield will be the number of3571// trailing ones (the AND) minus the number of bits we shift off3572NumBits = llvm::countr_one(MaskVal) - ShiftAmt;3573} else if (isShiftedMask_64(MaskVal)) {3574NumZeros = llvm::countr_zero(MaskVal);3575unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);3576// The number of bits in the result bitfield will be the number of3577// trailing zeros plus the number of set bits in the mask minus the3578// number of bits we shift off3579NumBits = NumZeros + NumOnes - ShiftAmt;3580} else {3581// This is not a mask we can handle3582return false;3583}35843585if (ShiftAmt < NumZeros) {3586// Handling this case would require extra logic that would make this3587// transformation non-profitable3588return false;3589}35903591Val = AndLHS;3592Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);3593Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);3594} else if (LHS->getOpcode() == ISD::SHL) {3595// Here, we have a pattern like:3596//3597// (sra (shl val, NN), MM)3598// or3599// (srl (shl val, NN), MM)3600//3601// If MM >= NN, we can efficiently optimize this with bfe3602Val = LHS->getOperand(0);36033604SDValue ShlRHS = LHS->getOperand(1);3605ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);3606if (!ShlCnst) {3607// Shift amount must be constant3608return false;3609}3610uint64_t InnerShiftAmt = ShlCnst->getZExtValue();36113612SDValue ShrRHS = RHS;3613ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);3614if (!ShrCnst) {3615// Shift amount must be constant3616return false;3617}3618uint64_t OuterShiftAmt = ShrCnst->getZExtValue();36193620// To avoid extra codegen and be profitable, we need Outer >= Inner3621if (OuterShiftAmt < InnerShiftAmt) {3622return false;3623}36243625// If the outer shift is more than the type size, we have no bitfield to3626// extract (since we also check that the inner shift is <= the outer shift3627// then this also implies that the inner shift is < the type size)3628if (OuterShiftAmt >= Val.getValueSizeInBits()) {3629return false;3630}36313632Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,3633MVT::i32);3634Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,3635DL, MVT::i32);36363637if (N->getOpcode() == ISD::SRA) {3638// If we have a arithmetic right shift, we need to use the signed bfe3639// variant3640IsSigned = true;3641}3642} else {3643// No can do...3644return false;3645}3646} else {3647// No can do...3648return false;3649}365036513652unsigned Opc;3653// For the BFE operations we form here from "and" and "srl", always use the3654// unsigned variants.3655if (Val.getValueType() == MVT::i32) {3656if (IsSigned) {3657Opc = NVPTX::BFE_S32rii;3658} else {3659Opc = NVPTX::BFE_U32rii;3660}3661} else if (Val.getValueType() == MVT::i64) {3662if (IsSigned) {3663Opc = NVPTX::BFE_S64rii;3664} else {3665Opc = NVPTX::BFE_U64rii;3666}3667} else {3668// We cannot handle this type3669return false;3670}36713672SDValue Ops[] = {3673Val, Start, Len3674};36753676ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));3677return true;3678}36793680// SelectDirectAddr - Match a direct address for DAG.3681// A direct address could be a globaladdress or externalsymbol.3682bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {3683// Return true if TGA or ES.3684if (N.getOpcode() == ISD::TargetGlobalAddress ||3685N.getOpcode() == ISD::TargetExternalSymbol) {3686Address = N;3687return true;3688}3689if (N.getOpcode() == NVPTXISD::Wrapper) {3690Address = N.getOperand(0);3691return true;3692}3693// addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol3694if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {3695if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&3696CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&3697CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)3698return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);3699}3700return false;3701}37023703// symbol+offset3704bool NVPTXDAGToDAGISel::SelectADDRsi_imp(3705SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {3706if (Addr.getOpcode() == ISD::ADD) {3707if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {3708SDValue base = Addr.getOperand(0);3709if (SelectDirectAddr(base, Base)) {3710Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),3711mvt);3712return true;3713}3714}3715}3716return false;3717}37183719// symbol+offset3720bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,3721SDValue &Base, SDValue &Offset) {3722return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);3723}37243725// symbol+offset3726bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,3727SDValue &Base, SDValue &Offset) {3728return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);3729}37303731// register+offset3732bool NVPTXDAGToDAGISel::SelectADDRri_imp(3733SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {3734if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {3735Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);3736Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);3737return true;3738}3739if (Addr.getOpcode() == ISD::TargetExternalSymbol ||3740Addr.getOpcode() == ISD::TargetGlobalAddress)3741return false; // direct calls.37423743if (Addr.getOpcode() == ISD::ADD) {3744if (SelectDirectAddr(Addr.getOperand(0), Addr)) {3745return false;3746}3747if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {3748if (FrameIndexSDNode *FIN =3749dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))3750// Constant offset from frame ref.3751Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);3752else3753Base = Addr.getOperand(0);37543755// Offset must fit in a 32-bit signed int in PTX [register+offset] address3756// mode3757if (!CN->getAPIntValue().isSignedIntN(32))3758return false;37593760Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),3761MVT::i32);3762return true;3763}3764}3765return false;3766}37673768// register+offset3769bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,3770SDValue &Base, SDValue &Offset) {3771return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);3772}37733774// register+offset3775bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,3776SDValue &Base, SDValue &Offset) {3777return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);3778}37793780bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,3781unsigned int spN) const {3782const Value *Src = nullptr;3783if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {3784if (spN == 0 && mN->getMemOperand()->getPseudoValue())3785return true;3786Src = mN->getMemOperand()->getValue();3787}3788if (!Src)3789return false;3790if (auto *PT = dyn_cast<PointerType>(Src->getType()))3791return (PT->getAddressSpace() == spN);3792return false;3793}37943795/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for3796/// inline asm expressions.3797bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(3798const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,3799std::vector<SDValue> &OutOps) {3800SDValue Op0, Op1;3801switch (ConstraintID) {3802default:3803return true;3804case InlineAsm::ConstraintCode::m: // memory3805if (SelectDirectAddr(Op, Op0)) {3806OutOps.push_back(Op0);3807OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));3808return false;3809}3810if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {3811OutOps.push_back(Op0);3812OutOps.push_back(Op1);3813return false;3814}3815break;3816}3817return true;3818}38193820void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {3821// Lower a CopyToReg with two 64-bit inputs3822// Dst:i128, lo:i64, hi:i643823//3824// CopyToReg Dst, lo, hi;3825//3826// ==>3827//3828// tmp = V2I64toI128 {lo, hi};3829// CopyToReg Dst, tmp;3830SDValue Dst = N->getOperand(1);3831SDValue Lo = N->getOperand(2);3832SDValue Hi = N->getOperand(3);38333834SDLoc DL(N);3835SDNode *Mov =3836CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});38373838SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);3839NewOps[0] = N->getOperand(0);3840NewOps[1] = Dst;3841NewOps[2] = SDValue(Mov, 0);3842if (N->getNumOperands() == 5)3843NewOps[3] = N->getOperand(4);3844SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);38453846ReplaceNode(N, NewValue.getNode());3847}38483849void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {3850// Lower CopyFromReg from a 128-bit regs to two 64-bit regs3851// Dst:i128, Src:i1283852//3853// {lo, hi} = CopyFromReg Src3854//3855// ==>3856//3857// {lo, hi} = I128toV2I64 Src3858//3859SDValue Ch = N->getOperand(0);3860SDValue Src = N->getOperand(1);3861SDValue Glue = N->getOperand(2);3862SDLoc DL(N);38633864// Add Glue and Ch to the operands and results to avoid break the execution3865// order3866SDNode *Mov = CurDAG->getMachineNode(3867NVPTX::I128toV2I64, DL,3868{MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},3869{Src, Ch, Glue});38703871ReplaceNode(N, Mov);3872}38733874/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a3875/// conversion from \p SrcTy to \p DestTy.3876unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,3877LoadSDNode *LdNode) {3878bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;3879switch (SrcTy.SimpleTy) {3880default:3881llvm_unreachable("Unhandled source type");3882case MVT::i8:3883switch (DestTy.SimpleTy) {3884default:3885llvm_unreachable("Unhandled dest type");3886case MVT::i16:3887return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;3888case MVT::i32:3889return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;3890case MVT::i64:3891return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;3892}3893case MVT::i16:3894switch (DestTy.SimpleTy) {3895default:3896llvm_unreachable("Unhandled dest type");3897case MVT::i8:3898return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;3899case MVT::i32:3900return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;3901case MVT::i64:3902return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;3903}3904case MVT::i32:3905switch (DestTy.SimpleTy) {3906default:3907llvm_unreachable("Unhandled dest type");3908case MVT::i8:3909return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;3910case MVT::i16:3911return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;3912case MVT::i64:3913return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;3914}3915case MVT::i64:3916switch (DestTy.SimpleTy) {3917default:3918llvm_unreachable("Unhandled dest type");3919case MVT::i8:3920return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;3921case MVT::i16:3922return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;3923case MVT::i32:3924return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;3925}3926case MVT::f16:3927switch (DestTy.SimpleTy) {3928default:3929llvm_unreachable("Unhandled dest type");3930case MVT::f32:3931return NVPTX::CVT_f32_f16;3932case MVT::f64:3933return NVPTX::CVT_f64_f16;3934}3935}3936}393739383939