Path: blob/main/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <optional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
             " 1: do it  2: do it aggressively"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-div32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v8i16: // <4 x i16x2>
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

static bool Is16bitsType(MVT VT) {
  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
          VT.SimpleTy == MVT::i16);
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16/v2bf16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
        switch (EltVT.getSimpleVT().SimpleTy) {
        case MVT::f16:
          EltVT = MVT::v2f16;
          break;
        case MVT::bf16:
          EltVT = MVT::v2bf16;
          break;
        case MVT::i16:
          EltVT = MVT::v2i16;
          break;
        default:
          llvm_unreachable("Unexpected type");
        }
        NumElts /= 2;
      } else if (EltVT.getSimpleVT() == MVT::i8 &&
                 (NumElts % 4 == 0 || NumElts == 3)) {
        // v*i8 are formally lowered as v4i8
        EltVT = MVT::v4i8;
        NumElts = (NumElts + 3) / 4;
      } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
        // v2i8 is promoted to v2i16
        NumElts = 1;
        EltVT = MVT::v2i16;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow.
Try to reduce the width of the divide if423// possible.424addBypassSlowDiv(64, 32);425426// By default, use the Source scheduling427if (sched4reg)428setSchedulingPreference(Sched::RegPressure);429else430setSchedulingPreference(Sched::Source);431432auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,433LegalizeAction NoF16Action) {434setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);435};436437auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,438LegalizeAction NoBF16Action) {439bool IsOpSupported = STI.hasBF16Math();440// Few instructions are available on sm_90 only441switch(Op) {442case ISD::FADD:443case ISD::FMUL:444case ISD::FSUB:445case ISD::SELECT:446case ISD::SELECT_CC:447case ISD::SETCC:448case ISD::FEXP2:449case ISD::FCEIL:450case ISD::FFLOOR:451case ISD::FNEARBYINT:452case ISD::FRINT:453case ISD::FROUNDEVEN:454case ISD::FTRUNC:455IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;456break;457}458setOperationAction(459Op, VT, IsOpSupported ? Action : NoBF16Action);460};461462auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,463LegalizeAction NoI16x2Action) {464bool IsOpSupported = false;465// instructions are available on sm_90 only466switch (Op) {467case ISD::ADD:468case ISD::SMAX:469case ISD::SMIN:470case ISD::UMIN:471case ISD::UMAX:472IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;473break;474}475setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);476};477478addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);479addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);480addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);481addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);482addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);483addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);484addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);485addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);486addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);487addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);488addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);489addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);490491// Conversion to/from FP16/FP16x2 is always legal.492setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);493setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);494setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);495setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);496497setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);498if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)499setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);500501setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);502setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);503504// Conversion to/from BFP16/BFP16x2 is always legal.505setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);506setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);507setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);508setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);509510setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);511setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);512if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)513AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);514515// Conversion to/from i16/i16x2 is always legal.516setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, 
Custom);517setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);518setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);519setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);520521setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);522setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);523setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);524setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);525// Only logical ops can be done on v4i8 directly, others must be done526// elementwise.527setOperationAction(528{ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,529ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ,530ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR,531ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY,532ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY,533ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC,534ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX,535ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA,536ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO,537ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC,538ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT,539ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX,540ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,541ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,542ISD::USUBSAT},543MVT::v4i8, Expand);544545// Operations not directly supported by NVPTX.546for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,547MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,548MVT::i32, MVT::i64}) {549setOperationAction(ISD::SELECT_CC, VT, Expand);550setOperationAction(ISD::BR_CC, VT, Expand);551}552553// Some SIGN_EXTEND_INREG can be done using cvt instruction.554// For others we will expand to a SHL/SRA pair.555setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);556setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);557setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);558setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);559setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);560setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);561562setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);563setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);564setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);565setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);566setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);567setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);568569setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);570setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);571572// TODO: we may consider expanding ROTL/ROTR on older GPUs. 
Currently on GPUs573// that don't have h/w rotation we lower them to multi-instruction assembly.574// See ROT*_sw in NVPTXIntrInfo.td575setOperationAction(ISD::ROTL, MVT::i64, Legal);576setOperationAction(ISD::ROTR, MVT::i64, Legal);577setOperationAction(ISD::ROTL, MVT::i32, Legal);578setOperationAction(ISD::ROTR, MVT::i32, Legal);579580setOperationAction(ISD::ROTL, MVT::i16, Expand);581setOperationAction(ISD::ROTL, MVT::v2i16, Expand);582setOperationAction(ISD::ROTR, MVT::i16, Expand);583setOperationAction(ISD::ROTR, MVT::v2i16, Expand);584setOperationAction(ISD::ROTL, MVT::i8, Expand);585setOperationAction(ISD::ROTR, MVT::i8, Expand);586setOperationAction(ISD::BSWAP, MVT::i16, Expand);587588// Indirect branch is not supported.589// This also disables Jump Table creation.590setOperationAction(ISD::BR_JT, MVT::Other, Expand);591setOperationAction(ISD::BRIND, MVT::Other, Expand);592593setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);594setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);595596// We want to legalize constant related memmove and memcopy597// intrinsics.598setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);599600// Turn FP extload into load/fpextend601setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);602setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);603setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);604setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);605setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);606setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);607setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);608setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);609setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);610setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);611setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);612setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);613setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);614setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);615setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);616setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);617setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);618setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);619setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);620// Turn FP truncstore into trunc + store.621// FIXME: vector types should also be expanded622setTruncStoreAction(MVT::f32, MVT::f16, Expand);623setTruncStoreAction(MVT::f64, MVT::f16, Expand);624setTruncStoreAction(MVT::f32, MVT::bf16, Expand);625setTruncStoreAction(MVT::f64, MVT::bf16, Expand);626setTruncStoreAction(MVT::f64, MVT::f32, Expand);627628// PTX does not support load / store predicate registers629setOperationAction(ISD::LOAD, MVT::i1, Custom);630setOperationAction(ISD::STORE, MVT::i1, Custom);631632for (MVT VT : MVT::integer_valuetypes()) {633setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);634setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);635setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);636setTruncStoreAction(VT, MVT::i1, Expand);637}638639// expand extload of vector of integers.640setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,641MVT::v2i8, Expand);642setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);643644// This is legal in NVPTX645setOperationAction(ISD::ConstantFP, MVT::f64, Legal);646setOperationAction(ISD::ConstantFP, 
MVT::f32, Legal);647setOperationAction(ISD::ConstantFP, MVT::f16, Legal);648setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);649650setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);651setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);652653// TRAP can be lowered to PTX trap654setOperationAction(ISD::TRAP, MVT::Other, Legal);655656// Register custom handling for vector loads/stores657for (MVT VT : MVT::fixedlen_vector_valuetypes()) {658if (IsPTXVectorType(VT)) {659setOperationAction(ISD::LOAD, VT, Custom);660setOperationAction(ISD::STORE, VT, Custom);661setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);662}663}664665// Support varargs.666setOperationAction(ISD::VASTART, MVT::Other, Custom);667setOperationAction(ISD::VAARG, MVT::Other, Custom);668setOperationAction(ISD::VACOPY, MVT::Other, Expand);669setOperationAction(ISD::VAEND, MVT::Other, Expand);670671// Custom handling for i8 intrinsics672setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);673674for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {675setOperationAction(ISD::ABS, Ty, Legal);676setOperationAction(ISD::SMIN, Ty, Legal);677setOperationAction(ISD::SMAX, Ty, Legal);678setOperationAction(ISD::UMIN, Ty, Legal);679setOperationAction(ISD::UMAX, Ty, Legal);680681setOperationAction(ISD::CTPOP, Ty, Legal);682setOperationAction(ISD::CTLZ, Ty, Legal);683}684685setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);686setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);687setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);688setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);689setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);690setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);691setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);692693setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);694setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);695setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);696setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);697setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);698setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);699700// Other arithmetic and logic ops are unsupported.701setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,702ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,703ISD::SINT_TO_FP, ISD::UINT_TO_FP},704MVT::v2i16, Expand);705706setOperationAction(ISD::ADDC, MVT::i32, Legal);707setOperationAction(ISD::ADDE, MVT::i32, Legal);708setOperationAction(ISD::SUBC, MVT::i32, Legal);709setOperationAction(ISD::SUBE, MVT::i32, Legal);710if (STI.getPTXVersion() >= 43) {711setOperationAction(ISD::ADDC, MVT::i64, Legal);712setOperationAction(ISD::ADDE, MVT::i64, Legal);713setOperationAction(ISD::SUBC, MVT::i64, Legal);714setOperationAction(ISD::SUBE, MVT::i64, Legal);715}716717setOperationAction(ISD::CTTZ, MVT::i16, Expand);718setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);719setOperationAction(ISD::CTTZ, MVT::i32, Expand);720setOperationAction(ISD::CTTZ, MVT::i64, Expand);721722// PTX does not directly support SELP of i1, so promote to i32 first723setOperationAction(ISD::SELECT, MVT::i1, Custom);724725// PTX cannot multiply two i64s in a single instruction.726setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);727setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);728729// We have some custom DAG combine patterns for these nodes730setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, 
ISD::FADD,731ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,732ISD::VSELECT});733734// setcc for f16x2 and bf16x2 needs special handling to prevent735// legalizer's attempt to scalarize it due to v2i1 not being legal.736if (STI.allowFP16Math() || STI.hasBF16Math())737setTargetDAGCombine(ISD::SETCC);738739// Promote fp16 arithmetic if fp16 hardware isn't available or the740// user passed --nvptx-no-fp16-math. The flag is useful because,741// although sm_53+ GPUs have some sort of FP16 support in742// hardware, only sm_53 and sm_60 have full implementation. Others743// only have token amount of hardware and are likely to run faster744// by using fp32 units instead.745for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {746setFP16OperationAction(Op, MVT::f16, Legal, Promote);747setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);748setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);749// bf16 must be promoted to f32.750setBF16OperationAction(Op, MVT::bf16, Legal, Promote);751if (getOperationAction(Op, MVT::bf16) == Promote)752AddPromotedToType(Op, MVT::bf16, MVT::f32);753}754755// f16/f16x2 neg was introduced in PTX 60, SM_53.756const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&757STI.getPTXVersion() >= 60 &&758STI.allowFP16Math();759for (const auto &VT : {MVT::f16, MVT::v2f16})760setOperationAction(ISD::FNEG, VT,761IsFP16FP16x2NegAvailable ? Legal : Expand);762763setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);764setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);765// (would be) Library functions.766767// These map to conversion instructions for scalar FP types.768for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,769ISD::FROUNDEVEN, ISD::FTRUNC}) {770setOperationAction(Op, MVT::f16, Legal);771setOperationAction(Op, MVT::f32, Legal);772setOperationAction(Op, MVT::f64, Legal);773setOperationAction(Op, MVT::v2f16, Expand);774setOperationAction(Op, MVT::v2bf16, Expand);775setBF16OperationAction(Op, MVT::bf16, Legal, Promote);776if (getOperationAction(Op, MVT::bf16) == Promote)777AddPromotedToType(Op, MVT::bf16, MVT::f32);778}779780if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {781setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);782}783if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {784for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {785setOperationAction(ISD::FP_EXTEND, VT, Custom);786setOperationAction(ISD::FP_ROUND, VT, Custom);787}788}789790// sm_80 only has conversions between f32 and bf16. 
Custom lower all other791// bf16 conversions.792if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {793for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {794setOperationAction(795{ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},796VT, Custom);797}798setOperationAction(799{ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},800MVT::bf16, Custom);801}802803setOperationAction(ISD::FROUND, MVT::f16, Promote);804setOperationAction(ISD::FROUND, MVT::v2f16, Expand);805setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);806setOperationAction(ISD::FROUND, MVT::f32, Custom);807setOperationAction(ISD::FROUND, MVT::f64, Custom);808setOperationAction(ISD::FROUND, MVT::bf16, Promote);809AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);810811// 'Expand' implements FCOPYSIGN without calling an external library.812setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);813setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);814setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);815setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);816setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);817setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);818819// These map to corresponding instructions for f32/f64. f16 must be820// promoted to f32. v2f16 is expanded to f16, which is then promoted821// to f32.822for (const auto &Op :823{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {824setOperationAction(Op, MVT::f16, Promote);825setOperationAction(Op, MVT::f32, Legal);826setOperationAction(Op, MVT::f64, Legal);827setOperationAction(Op, MVT::v2f16, Expand);828setOperationAction(Op, MVT::v2bf16, Expand);829setOperationAction(Op, MVT::bf16, Promote);830AddPromotedToType(Op, MVT::bf16, MVT::f32);831}832for (const auto &Op : {ISD::FABS}) {833setOperationAction(Op, MVT::f16, Promote);834setOperationAction(Op, MVT::f32, Legal);835setOperationAction(Op, MVT::f64, Legal);836setOperationAction(Op, MVT::v2f16, Expand);837setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);838setBF16OperationAction(Op, MVT::bf16, Legal, Promote);839if (getOperationAction(Op, MVT::bf16) == Promote)840AddPromotedToType(Op, MVT::bf16, MVT::f32);841}842843// max.f16, max.f16x2 and max.NaN are supported on sm_80+.844auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {845bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;846return IsAtLeastSm80 ? Legal : NotSm80Action;847};848for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {849setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);850setOperationAction(Op, MVT::f32, Legal);851setOperationAction(Op, MVT::f64, Legal);852setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);853setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);854setBF16OperationAction(Op, MVT::bf16, Legal, Promote);855if (getOperationAction(Op, MVT::bf16) == Promote)856AddPromotedToType(Op, MVT::bf16, MVT::f32);857}858for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {859setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);860setFP16OperationAction(Op, MVT::bf16, Legal, Expand);861setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));862setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);863setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);864}865866// Custom lowering for inline asm with 128-bit operands867setOperationAction(ISD::CopyToReg, MVT::i128, Custom);868setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);869870// No FEXP2, FLOG2. 
The PTX ex2 and log2 functions are always approximate.871// No FPOW or FREM in PTX.872873// Now deduce the information based on the above mentioned874// actions875computeRegisterProperties(STI.getRegisterInfo());876877setMinCmpXchgSizeInBits(32);878setMaxAtomicSizeInBitsSupported(64);879setMaxDivRemBitWidthSupported(64);880}881882const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {883884#define MAKE_CASE(V) \885case V: \886return #V;887888switch ((NVPTXISD::NodeType)Opcode) {889case NVPTXISD::FIRST_NUMBER:890break;891892MAKE_CASE(NVPTXISD::CALL)893MAKE_CASE(NVPTXISD::RET_GLUE)894MAKE_CASE(NVPTXISD::LOAD_PARAM)895MAKE_CASE(NVPTXISD::Wrapper)896MAKE_CASE(NVPTXISD::DeclareParam)897MAKE_CASE(NVPTXISD::DeclareScalarParam)898MAKE_CASE(NVPTXISD::DeclareRet)899MAKE_CASE(NVPTXISD::DeclareScalarRet)900MAKE_CASE(NVPTXISD::DeclareRetParam)901MAKE_CASE(NVPTXISD::PrintCall)902MAKE_CASE(NVPTXISD::PrintConvergentCall)903MAKE_CASE(NVPTXISD::PrintCallUni)904MAKE_CASE(NVPTXISD::PrintConvergentCallUni)905MAKE_CASE(NVPTXISD::LoadParam)906MAKE_CASE(NVPTXISD::LoadParamV2)907MAKE_CASE(NVPTXISD::LoadParamV4)908MAKE_CASE(NVPTXISD::StoreParam)909MAKE_CASE(NVPTXISD::StoreParamV2)910MAKE_CASE(NVPTXISD::StoreParamV4)911MAKE_CASE(NVPTXISD::StoreParamS32)912MAKE_CASE(NVPTXISD::StoreParamU32)913MAKE_CASE(NVPTXISD::CallArgBegin)914MAKE_CASE(NVPTXISD::CallArg)915MAKE_CASE(NVPTXISD::LastCallArg)916MAKE_CASE(NVPTXISD::CallArgEnd)917MAKE_CASE(NVPTXISD::CallVoid)918MAKE_CASE(NVPTXISD::CallVal)919MAKE_CASE(NVPTXISD::CallSymbol)920MAKE_CASE(NVPTXISD::Prototype)921MAKE_CASE(NVPTXISD::MoveParam)922MAKE_CASE(NVPTXISD::StoreRetval)923MAKE_CASE(NVPTXISD::StoreRetvalV2)924MAKE_CASE(NVPTXISD::StoreRetvalV4)925MAKE_CASE(NVPTXISD::PseudoUseParam)926MAKE_CASE(NVPTXISD::RETURN)927MAKE_CASE(NVPTXISD::CallSeqBegin)928MAKE_CASE(NVPTXISD::CallSeqEnd)929MAKE_CASE(NVPTXISD::CallPrototype)930MAKE_CASE(NVPTXISD::ProxyReg)931MAKE_CASE(NVPTXISD::LoadV2)932MAKE_CASE(NVPTXISD::LoadV4)933MAKE_CASE(NVPTXISD::LDGV2)934MAKE_CASE(NVPTXISD::LDGV4)935MAKE_CASE(NVPTXISD::LDUV2)936MAKE_CASE(NVPTXISD::LDUV4)937MAKE_CASE(NVPTXISD::StoreV2)938MAKE_CASE(NVPTXISD::StoreV4)939MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)940MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)941MAKE_CASE(NVPTXISD::IMAD)942MAKE_CASE(NVPTXISD::BFE)943MAKE_CASE(NVPTXISD::BFI)944MAKE_CASE(NVPTXISD::PRMT)945MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)946MAKE_CASE(NVPTXISD::SETP_F16X2)947MAKE_CASE(NVPTXISD::SETP_BF16X2)948MAKE_CASE(NVPTXISD::Dummy)949MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)950MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)951MAKE_CASE(NVPTXISD::Tex1DFloatS32)952MAKE_CASE(NVPTXISD::Tex1DFloatFloat)953MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)954MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad)955MAKE_CASE(NVPTXISD::Tex1DS32S32)956MAKE_CASE(NVPTXISD::Tex1DS32Float)957MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel)958MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad)959MAKE_CASE(NVPTXISD::Tex1DU32S32)960MAKE_CASE(NVPTXISD::Tex1DU32Float)961MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel)962MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad)963MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32)964MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat)965MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel)966MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad)967MAKE_CASE(NVPTXISD::Tex1DArrayS32S32)968MAKE_CASE(NVPTXISD::Tex1DArrayS32Float)969MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel)970MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad)971MAKE_CASE(NVPTXISD::Tex1DArrayU32S32)972MAKE_CASE(NVPTXISD::Tex1DArrayU32Float)973MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel)974MAKE_CASE(NVPTXISD::Tex1DArray
U32FloatGrad)975MAKE_CASE(NVPTXISD::Tex2DFloatS32)976MAKE_CASE(NVPTXISD::Tex2DFloatFloat)977MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel)978MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad)979MAKE_CASE(NVPTXISD::Tex2DS32S32)980MAKE_CASE(NVPTXISD::Tex2DS32Float)981MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel)982MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad)983MAKE_CASE(NVPTXISD::Tex2DU32S32)984MAKE_CASE(NVPTXISD::Tex2DU32Float)985MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel)986MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad)987MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32)988MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat)989MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel)990MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad)991MAKE_CASE(NVPTXISD::Tex2DArrayS32S32)992MAKE_CASE(NVPTXISD::Tex2DArrayS32Float)993MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel)994MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad)995MAKE_CASE(NVPTXISD::Tex2DArrayU32S32)996MAKE_CASE(NVPTXISD::Tex2DArrayU32Float)997MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel)998MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad)999MAKE_CASE(NVPTXISD::Tex3DFloatS32)1000MAKE_CASE(NVPTXISD::Tex3DFloatFloat)1001MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel)1002MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad)1003MAKE_CASE(NVPTXISD::Tex3DS32S32)1004MAKE_CASE(NVPTXISD::Tex3DS32Float)1005MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel)1006MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad)1007MAKE_CASE(NVPTXISD::Tex3DU32S32)1008MAKE_CASE(NVPTXISD::Tex3DU32Float)1009MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel)1010MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad)1011MAKE_CASE(NVPTXISD::TexCubeFloatFloat)1012MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel)1013MAKE_CASE(NVPTXISD::TexCubeS32Float)1014MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel)1015MAKE_CASE(NVPTXISD::TexCubeU32Float)1016MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel)1017MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat)1018MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel)1019MAKE_CASE(NVPTXISD::TexCubeArrayS32Float)1020MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel)1021MAKE_CASE(NVPTXISD::TexCubeArrayU32Float)1022MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel)1023MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat)1024MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat)1025MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat)1026MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat)1027MAKE_CASE(NVPTXISD::Tld4R2DS64Float)1028MAKE_CASE(NVPTXISD::Tld4G2DS64Float)1029MAKE_CASE(NVPTXISD::Tld4B2DS64Float)1030MAKE_CASE(NVPTXISD::Tld4A2DS64Float)1031MAKE_CASE(NVPTXISD::Tld4R2DU64Float)1032MAKE_CASE(NVPTXISD::Tld4G2DU64Float)1033MAKE_CASE(NVPTXISD::Tld4B2DU64Float)1034MAKE_CASE(NVPTXISD::Tld4A2DU64Float)10351036MAKE_CASE(NVPTXISD::TexUnified1DFloatS32)1037MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat)1038MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel)1039MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad)1040MAKE_CASE(NVPTXISD::TexUnified1DS32S32)1041MAKE_CASE(NVPTXISD::TexUnified1DS32Float)1042MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel)1043MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad)1044MAKE_CASE(NVPTXISD::TexUnified1DU32S32)1045MAKE_CASE(NVPTXISD::TexUnified1DU32Float)1046MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel)1047MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad)1048MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32)1049MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat)1050MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel)1051MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad)1052MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32)1053MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float)1054MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel)1055MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad)1056MAKE_CASE(NVPTXIS
D::TexUnified1DArrayU32S32)1057MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float)1058MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel)1059MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad)1060MAKE_CASE(NVPTXISD::TexUnified2DFloatS32)1061MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat)1062MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel)1063MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad)1064MAKE_CASE(NVPTXISD::TexUnified2DS32S32)1065MAKE_CASE(NVPTXISD::TexUnified2DS32Float)1066MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel)1067MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad)1068MAKE_CASE(NVPTXISD::TexUnified2DU32S32)1069MAKE_CASE(NVPTXISD::TexUnified2DU32Float)1070MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel)1071MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad)1072MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32)1073MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat)1074MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel)1075MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad)1076MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32)1077MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float)1078MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel)1079MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad)1080MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32)1081MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float)1082MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel)1083MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad)1084MAKE_CASE(NVPTXISD::TexUnified3DFloatS32)1085MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat)1086MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel)1087MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad)1088MAKE_CASE(NVPTXISD::TexUnified3DS32S32)1089MAKE_CASE(NVPTXISD::TexUnified3DS32Float)1090MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel)1091MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad)1092MAKE_CASE(NVPTXISD::TexUnified3DU32S32)1093MAKE_CASE(NVPTXISD::TexUnified3DU32Float)1094MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel)1095MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad)1096MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat)1097MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel)1098MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float)1099MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel)1100MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float)1101MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel)1102MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat)1103MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel)1104MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float)1105MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel)1106MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float)1107MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel)1108MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad)1109MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad)1110MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad)1111MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad)1112MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad)1113MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad)1114MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat)1115MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat)1116MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat)1117MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat)1118MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float)1119MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float)1120MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float)1121MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float)1122MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float)1123MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float)1124MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float)1125MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float)11261127MAKE_CASE(NVPTXISD::Suld1DI8Clamp)1128MAKE_CASE(NVPTXISD::Suld1DI
16Clamp)1129MAKE_CASE(NVPTXISD::Suld1DI32Clamp)1130MAKE_CASE(NVPTXISD::Suld1DI64Clamp)1131MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp)1132MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp)1133MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp)1134MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp)1135MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp)1136MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp)1137MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp)11381139MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp)1140MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp)1141MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp)1142MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp)1143MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp)1144MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp)1145MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp)1146MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp)1147MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp)1148MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp)1149MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp)11501151MAKE_CASE(NVPTXISD::Suld2DI8Clamp)1152MAKE_CASE(NVPTXISD::Suld2DI16Clamp)1153MAKE_CASE(NVPTXISD::Suld2DI32Clamp)1154MAKE_CASE(NVPTXISD::Suld2DI64Clamp)1155MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp)1156MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp)1157MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp)1158MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp)1159MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp)1160MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp)1161MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp)11621163MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp)1164MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp)1165MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp)1166MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp)1167MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp)1168MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp)1169MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp)1170MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp)1171MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp)1172MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp)1173MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp)11741175MAKE_CASE(NVPTXISD::Suld3DI8Clamp)1176MAKE_CASE(NVPTXISD::Suld3DI16Clamp)1177MAKE_CASE(NVPTXISD::Suld3DI32Clamp)1178MAKE_CASE(NVPTXISD::Suld3DI64Clamp)1179MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp)1180MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp)1181MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp)1182MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp)1183MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp)1184MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp)1185MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp)11861187MAKE_CASE(NVPTXISD::Suld1DI8Trap)1188MAKE_CASE(NVPTXISD::Suld1DI16Trap)1189MAKE_CASE(NVPTXISD::Suld1DI32Trap)1190MAKE_CASE(NVPTXISD::Suld1DI64Trap)1191MAKE_CASE(NVPTXISD::Suld1DV2I8Trap)1192MAKE_CASE(NVPTXISD::Suld1DV2I16Trap)1193MAKE_CASE(NVPTXISD::Suld1DV2I32Trap)1194MAKE_CASE(NVPTXISD::Suld1DV2I64Trap)1195MAKE_CASE(NVPTXISD::Suld1DV4I8Trap)1196MAKE_CASE(NVPTXISD::Suld1DV4I16Trap)1197MAKE_CASE(NVPTXISD::Suld1DV4I32Trap)11981199MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap)1200MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap)1201MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap)1202MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap)1203MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap)1204MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap)1205MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap)1206MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap)1207MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap)1208MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap)1209MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap)12101211MAKE_CASE(NVPTXISD::Suld2DI8Trap)1212MAKE_CASE(NVPTXISD::Suld2DI16Trap)1213MAKE_CASE(NVPTXISD::Suld2DI32Trap)1214MAKE_CASE(NVPTXISD::Suld2DI64Trap)1215MAKE_CASE(NVPTXISD::Suld2DV2I8Trap)1216MAKE_CASE(NVPTXISD::Suld2DV2I16Trap)1217MAKE_CASE(NVPTXISD::Suld2DV2I32Trap)1218MAKE_CASE(NVPTXISD::Suld2DV2I64Trap)1219MAKE_CASE(NVPTXISD::Suld2DV4I8Trap)1220MAKE_CASE(NVPTXISD::
Suld2DV4I16Trap)1221MAKE_CASE(NVPTXISD::Suld2DV4I32Trap)12221223MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap)1224MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap)1225MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap)1226MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap)1227MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap)1228MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap)1229MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap)1230MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap)1231MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap)1232MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap)1233MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap)12341235MAKE_CASE(NVPTXISD::Suld3DI8Trap)1236MAKE_CASE(NVPTXISD::Suld3DI16Trap)1237MAKE_CASE(NVPTXISD::Suld3DI32Trap)1238MAKE_CASE(NVPTXISD::Suld3DI64Trap)1239MAKE_CASE(NVPTXISD::Suld3DV2I8Trap)1240MAKE_CASE(NVPTXISD::Suld3DV2I16Trap)1241MAKE_CASE(NVPTXISD::Suld3DV2I32Trap)1242MAKE_CASE(NVPTXISD::Suld3DV2I64Trap)1243MAKE_CASE(NVPTXISD::Suld3DV4I8Trap)1244MAKE_CASE(NVPTXISD::Suld3DV4I16Trap)1245MAKE_CASE(NVPTXISD::Suld3DV4I32Trap)12461247MAKE_CASE(NVPTXISD::Suld1DI8Zero)1248MAKE_CASE(NVPTXISD::Suld1DI16Zero)1249MAKE_CASE(NVPTXISD::Suld1DI32Zero)1250MAKE_CASE(NVPTXISD::Suld1DI64Zero)1251MAKE_CASE(NVPTXISD::Suld1DV2I8Zero)1252MAKE_CASE(NVPTXISD::Suld1DV2I16Zero)1253MAKE_CASE(NVPTXISD::Suld1DV2I32Zero)1254MAKE_CASE(NVPTXISD::Suld1DV2I64Zero)1255MAKE_CASE(NVPTXISD::Suld1DV4I8Zero)1256MAKE_CASE(NVPTXISD::Suld1DV4I16Zero)1257MAKE_CASE(NVPTXISD::Suld1DV4I32Zero)12581259MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero)1260MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero)1261MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero)1262MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero)1263MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero)1264MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero)1265MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero)1266MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero)1267MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero)1268MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero)1269MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero)12701271MAKE_CASE(NVPTXISD::Suld2DI8Zero)1272MAKE_CASE(NVPTXISD::Suld2DI16Zero)1273MAKE_CASE(NVPTXISD::Suld2DI32Zero)1274MAKE_CASE(NVPTXISD::Suld2DI64Zero)1275MAKE_CASE(NVPTXISD::Suld2DV2I8Zero)1276MAKE_CASE(NVPTXISD::Suld2DV2I16Zero)1277MAKE_CASE(NVPTXISD::Suld2DV2I32Zero)1278MAKE_CASE(NVPTXISD::Suld2DV2I64Zero)1279MAKE_CASE(NVPTXISD::Suld2DV4I8Zero)1280MAKE_CASE(NVPTXISD::Suld2DV4I16Zero)1281MAKE_CASE(NVPTXISD::Suld2DV4I32Zero)12821283MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero)1284MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero)1285MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero)1286MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero)1287MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero)1288MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero)1289MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero)1290MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero)1291MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero)1292MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero)1293MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero)12941295MAKE_CASE(NVPTXISD::Suld3DI8Zero)1296MAKE_CASE(NVPTXISD::Suld3DI16Zero)1297MAKE_CASE(NVPTXISD::Suld3DI32Zero)1298MAKE_CASE(NVPTXISD::Suld3DI64Zero)1299MAKE_CASE(NVPTXISD::Suld3DV2I8Zero)1300MAKE_CASE(NVPTXISD::Suld3DV2I16Zero)1301MAKE_CASE(NVPTXISD::Suld3DV2I32Zero)1302MAKE_CASE(NVPTXISD::Suld3DV2I64Zero)1303MAKE_CASE(NVPTXISD::Suld3DV4I8Zero)1304MAKE_CASE(NVPTXISD::Suld3DV4I16Zero)1305MAKE_CASE(NVPTXISD::Suld3DV4I32Zero)1306}1307return nullptr;13081309#undef MAKE_CASE1310}13111312TargetLoweringBase::LegalizeTypeAction1313NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {1314if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&1315VT.getScalarType() == MVT::i1)1316return TypeSplitVector;1317if 
(Isv2x16VT(VT))1318return TypeLegal;1319return TargetLoweringBase::getPreferredVectorAction(VT);1320}13211322SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,1323int Enabled, int &ExtraSteps,1324bool &UseOneConst,1325bool Reciprocal) const {1326if (!(Enabled == ReciprocalEstimate::Enabled ||1327(Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))1328return SDValue();13291330if (ExtraSteps == ReciprocalEstimate::Unspecified)1331ExtraSteps = 0;13321333SDLoc DL(Operand);1334EVT VT = Operand.getValueType();1335bool Ftz = useF32FTZ(DAG.getMachineFunction());13361337auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {1338return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,1339DAG.getConstant(IID, DL, MVT::i32), Operand);1340};13411342// The sqrt and rsqrt refinement processes assume we always start out with an1343// approximation of the rsqrt. Therefore, if we're going to do any refinement1344// (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing1345// any refinement, we must return a regular sqrt.1346if (Reciprocal || ExtraSteps > 0) {1347if (VT == MVT::f32)1348return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f1349: Intrinsic::nvvm_rsqrt_approx_f);1350else if (VT == MVT::f64)1351return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);1352else1353return SDValue();1354} else {1355if (VT == MVT::f32)1356return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f1357: Intrinsic::nvvm_sqrt_approx_f);1358else {1359// There's no sqrt.approx.f64 instruction, so we emit1360// reciprocal(rsqrt(x)). This is faster than1361// select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain1362// x * rsqrt(x).)1363return DAG.getNode(1364ISD::INTRINSIC_WO_CHAIN, DL, VT,1365DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),1366MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));1367}1368}1369}13701371SDValue1372NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {1373SDLoc dl(Op);1374const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);1375auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());1376Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);1377return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);1378}13791380static bool IsTypePassedAsArray(const Type *Ty) {1381return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||1382Ty->isHalfTy() || Ty->isBFloatTy();1383}13841385std::string NVPTXTargetLowering::getPrototype(1386const DataLayout &DL, Type *retTy, const ArgListTy &Args,1387const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,1388std::optional<std::pair<unsigned, const APInt &>> VAInfo,1389const CallBase &CB, unsigned UniqueCallSite) const {1390auto PtrVT = getPointerTy(DL);13911392bool isABI = (STI.getSmVersion() >= 20);1393assert(isABI && "Non-ABI compilation is not supported");1394if (!isABI)1395return "";13961397std::string Prototype;1398raw_string_ostream O(Prototype);1399O << "prototype_" << UniqueCallSite << " : .callprototype ";14001401if (retTy->getTypeID() == Type::VoidTyID) {1402O << "()";1403} else {1404O << "(";1405if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&1406!IsTypePassedAsArray(retTy)) {1407unsigned size = 0;1408if (auto *ITy = dyn_cast<IntegerType>(retTy)) {1409size = ITy->getBitWidth();1410} else {1411assert(retTy->isFloatingPointTy() &&1412"Floating point type expected here");1413size = retTy->getPrimitiveSizeInBits();1414}1415// PTX ABI requires all scalar return values to 
be at least 321416// bits in size. fp16 normally uses .b16 as its storage type in1417// PTX, so its size must be adjusted here, too.1418size = promoteScalarArgumentSize(size);14191420O << ".param .b" << size << " _";1421} else if (isa<PointerType>(retTy)) {1422O << ".param .b" << PtrVT.getSizeInBits() << " _";1423} else if (IsTypePassedAsArray(retTy)) {1424O << ".param .align " << (retAlignment ? retAlignment->value() : 0)1425<< " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";1426} else {1427llvm_unreachable("Unknown return type");1428}1429O << ") ";1430}1431O << "_ (";14321433bool first = true;14341435unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();1436for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {1437Type *Ty = Args[i].Ty;1438if (!first) {1439O << ", ";1440}1441first = false;14421443if (!Outs[OIdx].Flags.isByVal()) {1444if (IsTypePassedAsArray(Ty)) {1445Align ParamAlign =1446getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);1447O << ".param .align " << ParamAlign.value() << " .b8 ";1448O << "_";1449O << "[" << DL.getTypeAllocSize(Ty) << "]";1450// update the index for Outs1451SmallVector<EVT, 16> vtparts;1452ComputeValueVTs(*this, DL, Ty, vtparts);1453if (unsigned len = vtparts.size())1454OIdx += len - 1;1455continue;1456}1457// i8 types in IR will be i16 types in SDAG1458assert((getValueType(DL, Ty) == Outs[OIdx].VT ||1459(getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&1460"type mismatch between callee prototype and arguments");1461// scalar type1462unsigned sz = 0;1463if (isa<IntegerType>(Ty)) {1464sz = cast<IntegerType>(Ty)->getBitWidth();1465sz = promoteScalarArgumentSize(sz);1466} else if (isa<PointerType>(Ty)) {1467sz = PtrVT.getSizeInBits();1468} else {1469sz = Ty->getPrimitiveSizeInBits();1470}1471O << ".param .b" << sz << " ";1472O << "_";1473continue;1474}14751476// Indirect calls need strict ABI alignment so we disable optimizations by1477// not providing a function to optimize.1478Type *ETy = Args[i].IndirectType;1479Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();1480Align ParamByValAlign =1481getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);14821483O << ".param .align " << ParamByValAlign.value() << " .b8 ";1484O << "_";1485O << "[" << Outs[OIdx].Flags.getByValSize() << "]";1486}14871488if (VAInfo)1489O << (first ? 
"" : ",") << " .param .align " << VAInfo->second1490<< " .b8 _[]\n";1491O << ")";1492if (shouldEmitPTXNoReturn(&CB, *nvTM))1493O << " .noreturn";1494O << ";";14951496return Prototype;1497}14981499Align NVPTXTargetLowering::getFunctionArgumentAlignment(1500const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {1501return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));1502}15031504Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,1505unsigned Idx,1506const DataLayout &DL) const {1507if (!CB) {1508// CallSite is zero, fallback to ABI type alignment1509return DL.getABITypeAlign(Ty);1510}15111512const Function *DirectCallee = CB->getCalledFunction();15131514if (!DirectCallee) {1515// We don't have a direct function symbol, but that may be because of1516// constant cast instructions in the call.15171518// With bitcast'd call targets, the instruction will be the call1519if (const auto *CI = dyn_cast<CallInst>(CB)) {1520// Check if we have call alignment metadata1521if (MaybeAlign StackAlign = getAlign(*CI, Idx))1522return StackAlign.value();1523}1524DirectCallee = getMaybeBitcastedCallee(CB);1525}15261527// Check for function alignment information if we found that the1528// ultimate target is a Function1529if (DirectCallee)1530return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);15311532// Call is indirect, fall back to the ABI type alignment1533return DL.getABITypeAlign(Ty);1534}15351536static bool adjustElementType(EVT &ElementType) {1537switch (ElementType.getSimpleVT().SimpleTy) {1538default:1539return false;1540case MVT::f16:1541case MVT::bf16:1542ElementType = MVT::i16;1543return true;1544case MVT::f32:1545case MVT::v2f16:1546case MVT::v2bf16:1547ElementType = MVT::i32;1548return true;1549case MVT::f64:1550ElementType = MVT::i64;1551return true;1552}1553}15541555// Use byte-store when the param address of the argument value is unaligned.1556// This may happen when the return value is a field of a packed structure.1557//1558// This is called in LowerCall() when passing the param values.1559static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,1560uint64_t Offset, EVT ElementType,1561SDValue StVal, SDValue &InGlue,1562unsigned ArgID, const SDLoc &dl) {1563// Bit logic only works on integer types1564if (adjustElementType(ElementType))1565StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);15661567// Store each byte1568SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);1569for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {1570// Shift the byte to the last byte position1571SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,1572DAG.getConstant(i * 8, dl, MVT::i32));1573SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),1574DAG.getConstant(Offset + i, dl, MVT::i32),1575ShiftVal, InGlue};1576// Trunc store only the last byte by using1577// st.param.b81578// The register type can be larger than b8.1579Chain = DAG.getMemIntrinsicNode(1580NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,1581MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);1582InGlue = Chain.getValue(1);1583}1584return Chain;1585}15861587// Use byte-load when the param adress of the returned value is unaligned.1588// This may happen when the returned value is a field of a packed structure.1589static SDValue1590LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,1591EVT ElementType, SDValue &InGlue,1592SmallVectorImpl<SDValue> 
&TempProxyRegOps,1593const SDLoc &dl) {1594// Bit logic only works on integer types1595EVT MergedType = ElementType;1596adjustElementType(MergedType);15971598// Load each byte and construct the whole value. Initial value to 01599SDValue RetVal = DAG.getConstant(0, dl, MergedType);1600// LoadParamMemI8 loads into i16 register only1601SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);1602for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {1603SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),1604DAG.getConstant(Offset + i, dl, MVT::i32),1605InGlue};1606// This will be selected to LoadParamMemI81607SDValue LdVal =1608DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,1609MVT::i8, MachinePointerInfo(), Align(1));1610SDValue TmpLdVal = LdVal.getValue(0);1611Chain = LdVal.getValue(1);1612InGlue = LdVal.getValue(2);16131614TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,1615TmpLdVal.getSimpleValueType(), TmpLdVal);1616TempProxyRegOps.push_back(TmpLdVal);16171618SDValue CMask = DAG.getConstant(255, dl, MergedType);1619SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);1620// Need to extend the i16 register to the whole width.1621TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);1622// Mask off the high bits. Leave only the lower 8bits.1623// Do this because we are using loadparam.b8.1624TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);1625// Shift and merge1626TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);1627RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);1628}1629if (ElementType != MergedType)1630RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);16311632return RetVal;1633}16341635SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,1636SmallVectorImpl<SDValue> &InVals) const {16371638if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))1639report_fatal_error(1640"Support for variadic functions (unsized array parameter) introduced "1641"in PTX ISA version 6.0 and requires target sm_30.");16421643SelectionDAG &DAG = CLI.DAG;1644SDLoc dl = CLI.DL;1645SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;1646SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;1647SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;1648SDValue Chain = CLI.Chain;1649SDValue Callee = CLI.Callee;1650bool &isTailCall = CLI.IsTailCall;1651ArgListTy &Args = CLI.getArgs();1652Type *RetTy = CLI.RetTy;1653const CallBase *CB = CLI.CB;1654const DataLayout &DL = DAG.getDataLayout();16551656bool isABI = (STI.getSmVersion() >= 20);1657assert(isABI && "Non-ABI compilation is not supported");1658if (!isABI)1659return Chain;16601661// Variadic arguments.1662//1663// Normally, for each argument, we declare a param scalar or a param1664// byte array in the .param space, and store the argument value to that1665// param scalar or array starting at offset 0.1666//1667// In the case of the first variadic argument, we declare a vararg byte array1668// with size 0. The exact size of this array isn't known at this point, so1669// it'll be patched later. All the variadic arguments will be stored to this1670// array at a certain offset (which gets tracked by 'VAOffset'). 
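  // (As an illustration, not a spec: for a device-side call like
  // printf("%d", x), the format string is a fixed argument and gets its own
  // .param declaration, while x is stored into the single trailing vararg
  // byte array whose size is only known after all variadic arguments have
  // been laid out.)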
The offset is1671// initially set to 0, so it can be used for non-variadic arguments (which use1672// 0 offset) to simplify the code.1673//1674// After all vararg is processed, 'VAOffset' holds the size of the1675// vararg byte array.16761677SDValue VADeclareParam; // vararg byte array1678unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic1679unsigned VAOffset = 0; // current offset in the param array16801681unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);1682SDValue TempChain = Chain;1683Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);1684SDValue InGlue = Chain.getValue(1);16851686unsigned ParamCount = 0;1687// Args.size() and Outs.size() need not match.1688// Outs.size() will be larger1689// * if there is an aggregate argument with multiple fields (each field1690// showing up separately in Outs)1691// * if there is a vector argument with more than typical vector-length1692// elements (generally if more than 4) where each vector element is1693// individually present in Outs.1694// So a different index should be used for indexing into Outs/OutVals.1695// See similar issue in LowerFormalArguments.1696unsigned OIdx = 0;1697// Declare the .params or .reg need to pass values1698// to the function1699for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {1700EVT VT = Outs[OIdx].VT;1701Type *Ty = Args[i].Ty;1702bool IsVAArg = (i >= CLI.NumFixedArgs);1703bool IsByVal = Outs[OIdx].Flags.isByVal();17041705SmallVector<EVT, 16> VTs;1706SmallVector<uint64_t, 16> Offsets;17071708assert((!IsByVal || Args[i].IndirectType) &&1709"byval arg must have indirect type");1710Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);1711ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);17121713Align ArgAlign;1714if (IsByVal) {1715// The ByValAlign in the Outs[OIdx].Flags is always set at this point,1716// so we don't need to worry whether it's naturally aligned or not.1717// See TargetLowering::LowerCallTo().1718Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();1719ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,1720InitialAlign, DL);1721if (IsVAArg)1722VAOffset = alignTo(VAOffset, ArgAlign);1723} else {1724ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);1725}17261727unsigned TypeSize =1728(IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));1729SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);17301731bool NeedAlign; // Does argument declaration specify alignment?1732bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);1733if (IsVAArg) {1734if (ParamCount == FirstVAArg) {1735SDValue DeclareParamOps[] = {1736Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),1737DAG.getConstant(ParamCount, dl, MVT::i32),1738DAG.getConstant(1, dl, MVT::i32), InGlue};1739VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,1740DeclareParamVTs, DeclareParamOps);1741}1742NeedAlign = PassAsArray;1743} else if (PassAsArray) {1744// declare .param .align <align> .b8 .param<n>[<size>];1745SDValue DeclareParamOps[] = {1746Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),1747DAG.getConstant(ParamCount, dl, MVT::i32),1748DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};1749Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,1750DeclareParamOps);1751NeedAlign = true;1752} else {1753// declare .param .b<size> .param<n>;1754if (VT.isInteger() || VT.isFloatingPoint()) {1755// PTX ABI requires integral types to be at least 32 bits in1756// size. 
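  // (So, roughly speaking, an i8 or i16 scalar argument is still declared as
  // ".param .b32"; promoteScalarArgumentSize() below performs that promotion.)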
FP16 is loaded/stored using i16, so it's handled1757// here as well.1758TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;1759}1760SDValue DeclareScalarParamOps[] = {1761Chain, DAG.getConstant(ParamCount, dl, MVT::i32),1762DAG.getConstant(TypeSize * 8, dl, MVT::i32),1763DAG.getConstant(0, dl, MVT::i32), InGlue};1764Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,1765DeclareScalarParamOps);1766NeedAlign = false;1767}1768InGlue = Chain.getValue(1);17691770// PTX Interoperability Guide 3.3(A): [Integer] Values shorter1771// than 32-bits are sign extended or zero extended, depending on1772// whether they are signed or unsigned types. This case applies1773// only to scalar parameters and not to aggregate values.1774bool ExtendIntegerParam =1775Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;17761777auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);1778SmallVector<SDValue, 6> StoreOperands;1779for (unsigned j = 0, je = VTs.size(); j != je; ++j) {1780EVT EltVT = VTs[j];1781int CurOffset = Offsets[j];1782MaybeAlign PartAlign;1783if (NeedAlign)1784PartAlign = commonAlignment(ArgAlign, CurOffset);17851786SDValue StVal = OutVals[OIdx];17871788MVT PromotedVT;1789if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {1790EltVT = EVT(PromotedVT);1791}1792if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {1793llvm::ISD::NodeType Ext =1794Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;1795StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);1796}17971798if (IsByVal) {1799auto PtrVT = getPointerTy(DL);1800SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,1801DAG.getConstant(CurOffset, dl, PtrVT));1802StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),1803PartAlign);1804} else if (ExtendIntegerParam) {1805assert(VTs.size() == 1 && "Scalar can't have multiple parts.");1806// zext/sext to i321807StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND1808: ISD::ZERO_EXTEND,1809dl, MVT::i32, StVal);1810}18111812if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {1813// Use 16-bit registers for small stores as it's the1814// smallest general purpose register size supported by NVPTX.1815StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);1816}18171818// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a1819// scalar store. In such cases, fall back to byte stores.1820if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&1821PartAlign.value() <1822DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {1823assert(StoreOperands.empty() && "Unfinished preceeding store.");1824Chain = LowerUnalignedStoreParam(1825DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,1826StVal, InGlue, ParamCount, dl);18271828// LowerUnalignedStoreParam took care of inserting the necessary nodes1829// into the SDAG, so just move on to the next element.1830if (!IsByVal)1831++OIdx;1832continue;1833}18341835// New store.1836if (VectorInfo[j] & PVF_FIRST) {1837assert(StoreOperands.empty() && "Unfinished preceding store.");1838StoreOperands.push_back(Chain);1839StoreOperands.push_back(1840DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));18411842StoreOperands.push_back(DAG.getConstant(1843IsByVal ? CurOffset + VAOffset : (IsVAArg ? 
VAOffset : CurOffset),1844dl, MVT::i32));1845}18461847// Record the value to store.1848StoreOperands.push_back(StVal);18491850if (VectorInfo[j] & PVF_LAST) {1851unsigned NumElts = StoreOperands.size() - 3;1852NVPTXISD::NodeType Op;1853switch (NumElts) {1854case 1:1855Op = NVPTXISD::StoreParam;1856break;1857case 2:1858Op = NVPTXISD::StoreParamV2;1859break;1860case 4:1861Op = NVPTXISD::StoreParamV4;1862break;1863default:1864llvm_unreachable("Invalid vector info.");1865}18661867StoreOperands.push_back(InGlue);18681869// Adjust type of the store op if we've extended the scalar1870// return value.1871EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;18721873Chain = DAG.getMemIntrinsicNode(1874Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,1875TheStoreType, MachinePointerInfo(), PartAlign,1876MachineMemOperand::MOStore);1877InGlue = Chain.getValue(1);18781879// Cleanup.1880StoreOperands.clear();18811882// TODO: We may need to support vector types that can be passed1883// as scalars in variadic arguments.1884if (!IsByVal && IsVAArg) {1885assert(NumElts == 1 &&1886"Vectorization is expected to be disabled for variadics.");1887VAOffset += DL.getTypeAllocSize(1888TheStoreType.getTypeForEVT(*DAG.getContext()));1889}1890}1891if (!IsByVal)1892++OIdx;1893}1894assert(StoreOperands.empty() && "Unfinished parameter store.");1895if (!IsByVal && VTs.size() > 0)1896--OIdx;1897++ParamCount;1898if (IsByVal && IsVAArg)1899VAOffset += TypeSize;1900}19011902GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());1903MaybeAlign retAlignment = std::nullopt;19041905// Handle Result1906if (Ins.size() > 0) {1907SmallVector<EVT, 16> resvtparts;1908ComputeValueVTs(*this, DL, RetTy, resvtparts);19091910// Declare1911// .param .align N .b8 retval0[<size-in-bytes>], or1912// .param .b<size-in-bits> retval01913unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);1914if (!IsTypePassedAsArray(RetTy)) {1915resultsz = promoteScalarArgumentSize(resultsz);1916SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);1917SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),1918DAG.getConstant(resultsz, dl, MVT::i32),1919DAG.getConstant(0, dl, MVT::i32), InGlue };1920Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,1921DeclareRetOps);1922InGlue = Chain.getValue(1);1923} else {1924retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);1925assert(retAlignment && "retAlignment is guaranteed to be set");1926SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);1927SDValue DeclareRetOps[] = {1928Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),1929DAG.getConstant(resultsz / 8, dl, MVT::i32),1930DAG.getConstant(0, dl, MVT::i32), InGlue};1931Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,1932DeclareRetOps);1933InGlue = Chain.getValue(1);1934}1935}19361937bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);1938// Set the size of the vararg param byte array if the callee is a variadic1939// function and the variadic part is not empty.1940if (HasVAArgs) {1941SDValue DeclareParamOps[] = {1942VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),1943VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),1944VADeclareParam.getOperand(4)};1945DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),1946VADeclareParam->getVTList(), DeclareParamOps);1947}19481949// Both indirect calls and libcalls have nullptr Func. 
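  // (Func is only non-null when the callee folds to a GlobalAddressSDNode; an
  // indirect call's callee is an arbitrary pointer value, and a libcall
  // created during legalization has an ExternalSymbol callee and no IR call
  // site at all.)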
  // In order to distinguish between them we must rely on the call site value,
  // which is valid for indirect calls but is always null for libcalls.
  bool isIndirectCall = !Func && CB;

  if (isa<ExternalSymbolSDNode>(Callee)) {
    Function* CalleeFunc = nullptr;

    // Try to find the callee in the current module.
    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
    assert(CalleeFunc != nullptr && "Libcall callee must be set.");

    // Set the "libcall callee" attribute to indicate that the function
    // must always have a declaration.
    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
  }

  if (isIndirectCall) {
    // This is the indirect function call case: PTX requires a prototype of the
    // form
    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
    // instruction.
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    std::string Proto = getPrototype(
        DL, RetTy, Args, Outs, retAlignment,
        HasVAArgs
            ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
                  CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
            : std::nullopt,
        *CB, UniqueCallSite);
    const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
    SDValue ProtoOps[] = {
        Chain,
        DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
        InGlue,
    };
    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
    InGlue = Chain.getValue(1);
  }
  // Op to just print "call"
  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue PrintCallOps[] = {
    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
  };
  // We model convergent calls as separate opcodes.
  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
  if (CLI.IsConvergent)
    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
                                              : NVPTXISD::PrintConvergentCall;
  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
  InGlue = Chain.getValue(1);

  // Ops to print out the function name
  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallVoidOps[] = { Chain, Callee, InGlue };
  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
  InGlue = Chain.getValue(1);

  // Ops to print out the param list
  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgBeginOps[] = { Chain, InGlue };
  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
                      CallArgBeginOps);
  InGlue = Chain.getValue(1);

  for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
       ++i) {
    unsigned opcode;
    if (i == (e - 1))
      opcode = NVPTXISD::LastCallArg;
    else
      opcode = NVPTXISD::CallArg;
    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
                             DAG.getConstant(i, dl, MVT::i32), InGlue };
    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
    InGlue = Chain.getValue(1);
  }
  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue CallArgEndOps[] = { Chain,
                              DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
                              InGlue };
  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
  InGlue = Chain.getValue(1);

  if (isIndirectCall) {
    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue PrototypeOps[] = {
        Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
    InGlue = Chain.getValue(1);
  }

  SmallVector<SDValue, 16> ProxyRegOps;
  SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
  // An item of the vector is filled if the element does not need a ProxyReg
  // operation on it and should be added to InVals as is. ProxyRegOps and
  // ProxyRegTruncates contain empty/none items at the same index.
  SmallVector<SDValue, 16> RetElts;
  // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
  // to use the values of `LoadParam`s; they are replaced later, once
  // `CALLSEQ_END` is added.
  SmallVector<SDValue, 16> TempProxyRegOps;

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
    SmallVector<EVT, 16> VTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
    assert(VTs.size() == Ins.size() && "Bad value decomposition");

    Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

    SmallVector<EVT, 6> LoadVTs;
    int VecIdx = -1; // Index of the first element of the vector.

    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
    // 32-bits are sign extended or zero extended, depending on whether
    // they are signed or unsigned types.
    bool ExtendIntegerRetVal =
        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
      bool needTruncate = false;
      EVT TheLoadType = VTs[i];
      EVT EltType = Ins[i].VT;
      Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
      MVT PromotedVT;

      if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
        TheLoadType = EVT(PromotedVT);
        EltType = EVT(PromotedVT);
        needTruncate = true;
      }

      if (ExtendIntegerRetVal) {
        TheLoadType = MVT::i32;
        EltType = MVT::i32;
        needTruncate = true;
      } else if (TheLoadType.getSizeInBits() < 16) {
        if (VTs[i].isInteger())
          needTruncate = true;
        EltType = MVT::i16;
      }

      // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for
      // a scalar load.
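      // (Typically this happens when the aggregate return type is a packed
      // struct, e.g. an i32 field at byte offset 1, so only single-byte
      // accesses are guaranteed to be aligned.)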
In such cases, fall back to byte loads.2100if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&2101EltAlign < DL.getABITypeAlign(2102TheLoadType.getTypeForEVT(*DAG.getContext()))) {2103assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");2104SDValue Ret = LowerUnalignedLoadRetParam(2105DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);2106ProxyRegOps.push_back(SDValue());2107ProxyRegTruncates.push_back(std::optional<MVT>());2108RetElts.resize(i);2109RetElts.push_back(Ret);21102111continue;2112}21132114// Record index of the very first element of the vector.2115if (VectorInfo[i] & PVF_FIRST) {2116assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");2117VecIdx = i;2118}21192120LoadVTs.push_back(EltType);21212122if (VectorInfo[i] & PVF_LAST) {2123unsigned NumElts = LoadVTs.size();2124LoadVTs.push_back(MVT::Other);2125LoadVTs.push_back(MVT::Glue);2126NVPTXISD::NodeType Op;2127switch (NumElts) {2128case 1:2129Op = NVPTXISD::LoadParam;2130break;2131case 2:2132Op = NVPTXISD::LoadParamV2;2133break;2134case 4:2135Op = NVPTXISD::LoadParamV4;2136break;2137default:2138llvm_unreachable("Invalid vector info.");2139}21402141SDValue LoadOperands[] = {2142Chain, DAG.getConstant(1, dl, MVT::i32),2143DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};2144SDValue RetVal = DAG.getMemIntrinsicNode(2145Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,2146MachinePointerInfo(), EltAlign,2147MachineMemOperand::MOLoad);21482149for (unsigned j = 0; j < NumElts; ++j) {2150ProxyRegOps.push_back(RetVal.getValue(j));21512152if (needTruncate)2153ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));2154else2155ProxyRegTruncates.push_back(std::optional<MVT>());2156}21572158Chain = RetVal.getValue(NumElts);2159InGlue = RetVal.getValue(NumElts + 1);21602161// Cleanup2162VecIdx = -1;2163LoadVTs.clear();2164}2165}2166}21672168Chain =2169DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);2170InGlue = Chain.getValue(1);21712172// Append ProxyReg instructions to the chain to make sure that `callseq_end`2173// will not get lost. 
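  // (Each returned value is routed through an NVPTXISD::ProxyReg node that
  // takes the post-CALLSEQ_END chain as an operand, so the call sequence
  // stays reachable from every use of the results.)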
Otherwise, during libcalls expansion, the nodes can become2174// dangling.2175for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {2176if (i < RetElts.size() && RetElts[i]) {2177InVals.push_back(RetElts[i]);2178continue;2179}21802181SDValue Ret = DAG.getNode(2182NVPTXISD::ProxyReg, dl,2183DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),2184{ Chain, ProxyRegOps[i], InGlue }2185);21862187Chain = Ret.getValue(1);2188InGlue = Ret.getValue(2);21892190if (ProxyRegTruncates[i]) {2191Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);2192}21932194InVals.push_back(Ret);2195}21962197for (SDValue &T : TempProxyRegOps) {2198SDValue Repl = DAG.getNode(2199NVPTXISD::ProxyReg, dl,2200DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),2201{Chain, T.getOperand(0), InGlue});2202DAG.ReplaceAllUsesWith(T, Repl);2203DAG.RemoveDeadNode(T.getNode());22042205Chain = Repl.getValue(1);2206InGlue = Repl.getValue(2);2207}22082209// set isTailCall to false for now, until we figure out how to express2210// tail call optimization in PTX2211isTailCall = false;2212return Chain;2213}22142215SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,2216SelectionDAG &DAG) const {22172218if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {2219const Function &Fn = DAG.getMachineFunction().getFunction();22202221DiagnosticInfoUnsupported NoDynamicAlloca(2222Fn,2223"Support for dynamic alloca introduced in PTX ISA version 7.3 and "2224"requires target sm_52.",2225SDLoc(Op).getDebugLoc());2226DAG.getContext()->diagnose(NoDynamicAlloca);2227auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),2228Op.getOperand(0)};2229return DAG.getMergeValues(Ops, SDLoc());2230}22312232SDValue Chain = Op.getOperand(0);2233SDValue Size = Op.getOperand(1);2234uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();2235SDLoc DL(Op.getNode());22362237// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.2238if (nvTM->is64Bit())2239Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);2240else2241Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);22422243SDValue AllocOps[] = {Chain, Size,2244DAG.getTargetConstant(Align, DL, MVT::i32)};2245SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,2246nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);22472248SDValue MergeOps[] = {Alloca, Chain};2249return DAG.getMergeValues(MergeOps, DL);2250}22512252// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()2253// (see LegalizeDAG.cpp). This is slow and uses local memory.2254// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.52255SDValue2256NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {2257SDNode *Node = Op.getNode();2258SDLoc dl(Node);2259SmallVector<SDValue, 8> Ops;2260unsigned NumOperands = Node->getNumOperands();2261for (unsigned i = 0; i < NumOperands; ++i) {2262SDValue SubOp = Node->getOperand(i);2263EVT VVT = SubOp.getNode()->getValueType(0);2264EVT EltVT = VVT.getVectorElementType();2265unsigned NumSubElem = VVT.getVectorNumElements();2266for (unsigned j = 0; j < NumSubElem; ++j) {2267Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,2268DAG.getIntPtrConstant(j, dl)));2269}2270}2271return DAG.getBuildVector(Node->getValueType(0), dl, Ops);2272}22732274// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. 
Normally it2275// would get lowered as two constant loads and vector-packing move.2276// Instead we want just a constant move:2277// mov.b32 %r2, 0x40003C002278SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,2279SelectionDAG &DAG) const {2280EVT VT = Op->getValueType(0);2281if (!(Isv2x16VT(VT) || VT == MVT::v4i8))2282return Op;22832284SDLoc DL(Op);22852286if (!llvm::all_of(Op->ops(), [](SDValue Operand) {2287return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||2288isa<ConstantFPSDNode>(Operand);2289})) {2290// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us2291// to optimize calculation of constant parts.2292if (VT == MVT::v4i8) {2293SDValue C8 = DAG.getConstant(8, DL, MVT::i32);2294SDValue E01 = DAG.getNode(2295NVPTXISD::BFI, DL, MVT::i32,2296DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),2297DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);2298SDValue E012 =2299DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,2300DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),2301E01, DAG.getConstant(16, DL, MVT::i32), C8);2302SDValue E0123 =2303DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,2304DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),2305E012, DAG.getConstant(24, DL, MVT::i32), C8);2306return DAG.getNode(ISD::BITCAST, DL, VT, E0123);2307}2308return Op;2309}23102311// Get value or the Nth operand as an APInt(32). Undef values treated as 0.2312auto GetOperand = [](SDValue Op, int N) -> APInt {2313const SDValue &Operand = Op->getOperand(N);2314EVT VT = Op->getValueType(0);2315if (Operand->isUndef())2316return APInt(32, 0);2317APInt Value;2318if (VT == MVT::v2f16 || VT == MVT::v2bf16)2319Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();2320else if (VT == MVT::v2i16 || VT == MVT::v4i8)2321Value = Operand->getAsAPIntVal();2322else2323llvm_unreachable("Unsupported type");2324// i8 values are carried around as i16, so we need to zero out upper bits,2325// so they do not get in the way of combining individual byte values2326if (VT == MVT::v4i8)2327Value = Value.trunc(8);2328return Value.zext(32);2329};2330APInt Value;2331if (Isv2x16VT(VT)) {2332Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);2333} else if (VT == MVT::v4i8) {2334Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |2335GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);2336} else {2337llvm_unreachable("Unsupported type");2338}2339SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);2340return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);2341}23422343SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,2344SelectionDAG &DAG) const {2345SDValue Index = Op->getOperand(1);2346SDValue Vector = Op->getOperand(0);2347SDLoc DL(Op);2348EVT VectorVT = Vector.getValueType();23492350if (VectorVT == MVT::v4i8) {2351SDValue BFE =2352DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,2353{Vector,2354DAG.getNode(ISD::MUL, DL, MVT::i32,2355DAG.getZExtOrTrunc(Index, DL, MVT::i32),2356DAG.getConstant(8, DL, MVT::i32)),2357DAG.getConstant(8, DL, MVT::i32)});2358return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));2359}23602361// Constant index will be matched by tablegen.2362if (isa<ConstantSDNode>(Index.getNode()))2363return Op;23642365// Extract individual elements and select one of them.2366assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");2367EVT EltVT = VectorVT.getVectorElementType();23682369SDLoc dl(Op.getNode());2370SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,2371DAG.getIntPtrConstant(0, 
dl));2372SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,2373DAG.getIntPtrConstant(1, dl));2374return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,2375ISD::CondCode::SETEQ);2376}23772378SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,2379SelectionDAG &DAG) const {2380SDValue Vector = Op->getOperand(0);2381EVT VectorVT = Vector.getValueType();23822383if (VectorVT != MVT::v4i8)2384return Op;2385SDLoc DL(Op);2386SDValue Value = Op->getOperand(1);2387if (Value->isUndef())2388return Vector;23892390SDValue Index = Op->getOperand(2);23912392SDValue BFI =2393DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,2394{DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,2395DAG.getNode(ISD::MUL, DL, MVT::i32,2396DAG.getZExtOrTrunc(Index, DL, MVT::i32),2397DAG.getConstant(8, DL, MVT::i32)),2398DAG.getConstant(8, DL, MVT::i32)});2399return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);2400}24012402SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,2403SelectionDAG &DAG) const {2404SDValue V1 = Op.getOperand(0);2405EVT VectorVT = V1.getValueType();2406if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)2407return Op;24082409// Lower shuffle to PRMT instruction.2410const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());2411SDValue V2 = Op.getOperand(1);2412uint32_t Selector = 0;2413for (auto I : llvm::enumerate(SVN->getMask())) {2414if (I.value() != -1) // -1 is a placeholder for undef.2415Selector |= (I.value() << (I.index() * 4));2416}24172418SDLoc DL(Op);2419return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,2420DAG.getConstant(Selector, DL, MVT::i32),2421DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));2422}2423/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which2424/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift2425/// amount, or2426/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift2427/// amount.2428SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,2429SelectionDAG &DAG) const {2430assert(Op.getNumOperands() == 3 && "Not a double-shift!");2431assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);24322433EVT VT = Op.getValueType();2434unsigned VTBits = VT.getSizeInBits();2435SDLoc dl(Op);2436SDValue ShOpLo = Op.getOperand(0);2437SDValue ShOpHi = Op.getOperand(1);2438SDValue ShAmt = Op.getOperand(2);2439unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL;24402441if (VTBits == 32 && STI.getSmVersion() >= 35) {2442// For 32bit and sm35, we can use the funnel shift 'shf' instruction.2443// {dHi, dLo} = {aHi, aLo} >> Amt2444// dHi = aHi >> Amt2445// dLo = shf.r.clamp aLo, aHi, Amt24462447SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);2448SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,2449ShAmt);24502451SDValue Ops[2] = { Lo, Hi };2452return DAG.getMergeValues(Ops, dl);2453}2454else {2455// {dHi, dLo} = {aHi, aLo} >> Amt2456// - if (Amt>=size) then2457// dLo = aHi >> (Amt-size)2458// dHi = aHi >> Amt (this is either all 0 or all 1)2459// else2460// dLo = (aLo >>logic Amt) | (aHi << (size-Amt))2461// dHi = aHi >> Amt24622463SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,2464DAG.getConstant(VTBits, dl, MVT::i32),2465ShAmt);2466SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);2467SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,2468DAG.getConstant(VTBits, dl, MVT::i32));2469SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);2470SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);2471SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);24722473SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,2474DAG.getConstant(VTBits, dl, MVT::i32),2475ISD::SETGE);2476SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);2477SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);24782479SDValue Ops[2] = { Lo, Hi };2480return DAG.getMergeValues(Ops, dl);2481}2482}24832484/// LowerShiftLeftParts - Lower SHL_PARTS, which2485/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift2486/// amount, or2487/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift2488/// amount.2489SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,2490SelectionDAG &DAG) const {2491assert(Op.getNumOperands() == 3 && "Not a double-shift!");2492assert(Op.getOpcode() == ISD::SHL_PARTS);24932494EVT VT = Op.getValueType();2495unsigned VTBits = VT.getSizeInBits();2496SDLoc dl(Op);2497SDValue ShOpLo = Op.getOperand(0);2498SDValue ShOpHi = Op.getOperand(1);2499SDValue ShAmt = Op.getOperand(2);25002501if (VTBits == 32 && STI.getSmVersion() >= 35) {2502// For 32bit and sm35, we can use the funnel shift 'shf' instruction.2503// {dHi, dLo} = {aHi, aLo} << Amt2504// dHi = shf.l.clamp aLo, aHi, Amt2505// dLo = aLo << Amt25062507SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,2508ShAmt);2509SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);25102511SDValue Ops[2] = { Lo, Hi };2512return DAG.getMergeValues(Ops, dl);2513}2514else {2515// {dHi, dLo} = {aHi, aLo} << Amt2516// - if (Amt>=size) then2517// dLo = aLo << Amt (all 0)2518// dLo = aLo << (Amt-size)2519// else2520// dLo = aLo << Amt2521// dHi = (aHi << Amt) | (aLo >> (size-Amt))25222523SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,2524DAG.getConstant(VTBits, dl, MVT::i32),2525ShAmt);2526SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);2527SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,2528DAG.getConstant(VTBits, dl, MVT::i32));2529SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);2530SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);2531SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);25322533SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,2534DAG.getConstant(VTBits, dl, MVT::i32),2535ISD::SETGE);2536SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);2537SDValue Hi = 
      DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
}

SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFROUND32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFROUND64(Op, DAG);

  llvm_unreachable("unhandled type");
}

// This is the rounding method used in CUDA libdevice in C-like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
  const int SignBitMask = 0x80000000;
  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
                             DAG.getConstant(SignBitMask, SL, MVT::i32));
  const int PointFiveInBits = 0x3F000000;
  SDValue PointFiveWithSignRaw =
      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
  SDValue PointFiveWithSign =
      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue IsLarge =
      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
                   ISD::SETOGT);
  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);

  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
}

// The implementation of round(double) is similar to that of round(float) in
// that they both separate the value range into three regions and use a method
// specific to the region to round the values. However, round(double) first
// calculates the round of the absolute value and then adds the sign back while
// round(float) directly rounds the value with sign.
SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue A = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

  // double RoundedA = (double) (int) (abs(A) + 0.5f);
  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
                                  DAG.getConstantFP(0.5, SL, VT));
  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

  // RoundedA = abs(A) < 0.5 ?
(double)0 : RoundedA;2618EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);2619SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,2620DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);2621RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,2622DAG.getConstantFP(0, SL, VT),2623RoundedA);26242625// Add sign to rounded_A2626RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);2627DAG.getNode(ISD::FTRUNC, SL, VT, A);26282629// RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;2630SDValue IsLarge =2631DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),2632ISD::SETOGT);2633return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);2634}26352636SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,2637SelectionDAG &DAG) const {2638assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);26392640if (Op.getValueType() == MVT::bf16) {2641SDLoc Loc(Op);2642return DAG.getNode(2643ISD::FP_ROUND, Loc, MVT::bf16,2644DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),2645DAG.getIntPtrConstant(0, Loc));2646}26472648// Everything else is considered legal.2649return Op;2650}26512652SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,2653SelectionDAG &DAG) const {2654assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);26552656if (Op.getOperand(0).getValueType() == MVT::bf16) {2657SDLoc Loc(Op);2658return DAG.getNode(2659Op.getOpcode(), Loc, Op.getValueType(),2660DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));2661}26622663// Everything else is considered legal.2664return Op;2665}26662667SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,2668SelectionDAG &DAG) const {2669EVT NarrowVT = Op.getValueType();2670SDValue Wide = Op.getOperand(0);2671EVT WideVT = Wide.getValueType();2672if (NarrowVT.getScalarType() == MVT::bf16) {2673const TargetLowering *TLI = STI.getTargetLowering();2674if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {2675return TLI->expandFP_ROUND(Op.getNode(), DAG);2676}2677if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {2678// This combination was the first to support f32 -> bf16.2679if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {2680if (WideVT.getScalarType() == MVT::f32) {2681return Op;2682}2683if (WideVT.getScalarType() == MVT::f64) {2684SDLoc Loc(Op);2685// Round-inexact-to-odd f64 to f32, then do the final rounding using2686// the hardware f32 -> bf16 instruction.2687SDValue rod = TLI->expandRoundInexactToOdd(2688WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)2689: MVT::f32,2690Wide, Loc, DAG);2691return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);2692}2693}2694return TLI->expandFP_ROUND(Op.getNode(), DAG);2695}2696}26972698// Everything else is considered legal.2699return Op;2700}27012702SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,2703SelectionDAG &DAG) const {2704SDValue Narrow = Op.getOperand(0);2705EVT NarrowVT = Narrow.getValueType();2706EVT WideVT = Op.getValueType();2707if (NarrowVT.getScalarType() == MVT::bf16) {2708if (WideVT.getScalarType() == MVT::f32 &&2709(STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {2710SDLoc Loc(Op);2711return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);2712}2713if (WideVT.getScalarType() == MVT::f64 &&2714(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {2715EVT F32 = NarrowVT.isVector() ? 
NarrowVT.changeVectorElementType(MVT::f32)2716: MVT::f32;2717SDLoc Loc(Op);2718if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {2719Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);2720} else {2721Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);2722}2723return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);2724}2725}27262727// Everything else is considered legal.2728return Op;2729}27302731static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {2732SDLoc DL(Op);2733if (Op.getValueType() != MVT::v2i16)2734return Op;2735EVT EltVT = Op.getValueType().getVectorElementType();2736SmallVector<SDValue> VecElements;2737for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {2738SmallVector<SDValue> ScalarArgs;2739llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),2740[&](const SDUse &O) {2741return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,2742O.get(), DAG.getIntPtrConstant(I, DL));2743});2744VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));2745}2746SDValue V =2747DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);2748return V;2749}27502751SDValue2752NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {2753switch (Op.getOpcode()) {2754case ISD::RETURNADDR:2755return SDValue();2756case ISD::FRAMEADDR:2757return SDValue();2758case ISD::GlobalAddress:2759return LowerGlobalAddress(Op, DAG);2760case ISD::INTRINSIC_W_CHAIN:2761return Op;2762case ISD::BUILD_VECTOR:2763return LowerBUILD_VECTOR(Op, DAG);2764case ISD::EXTRACT_SUBVECTOR:2765return Op;2766case ISD::EXTRACT_VECTOR_ELT:2767return LowerEXTRACT_VECTOR_ELT(Op, DAG);2768case ISD::INSERT_VECTOR_ELT:2769return LowerINSERT_VECTOR_ELT(Op, DAG);2770case ISD::VECTOR_SHUFFLE:2771return LowerVECTOR_SHUFFLE(Op, DAG);2772case ISD::CONCAT_VECTORS:2773return LowerCONCAT_VECTORS(Op, DAG);2774case ISD::STORE:2775return LowerSTORE(Op, DAG);2776case ISD::LOAD:2777return LowerLOAD(Op, DAG);2778case ISD::SHL_PARTS:2779return LowerShiftLeftParts(Op, DAG);2780case ISD::SRA_PARTS:2781case ISD::SRL_PARTS:2782return LowerShiftRightParts(Op, DAG);2783case ISD::SELECT:2784return LowerSelect(Op, DAG);2785case ISD::FROUND:2786return LowerFROUND(Op, DAG);2787case ISD::SINT_TO_FP:2788case ISD::UINT_TO_FP:2789return LowerINT_TO_FP(Op, DAG);2790case ISD::FP_TO_SINT:2791case ISD::FP_TO_UINT:2792return LowerFP_TO_INT(Op, DAG);2793case ISD::FP_ROUND:2794return LowerFP_ROUND(Op, DAG);2795case ISD::FP_EXTEND:2796return LowerFP_EXTEND(Op, DAG);2797case ISD::VAARG:2798return LowerVAARG(Op, DAG);2799case ISD::VASTART:2800return LowerVASTART(Op, DAG);2801case ISD::ABS:2802case ISD::SMIN:2803case ISD::SMAX:2804case ISD::UMIN:2805case ISD::UMAX:2806case ISD::ADD:2807case ISD::SUB:2808case ISD::MUL:2809case ISD::SHL:2810case ISD::SREM:2811case ISD::UREM:2812return LowerVectorArith(Op, DAG);2813case ISD::DYNAMIC_STACKALLOC:2814return LowerDYNAMIC_STACKALLOC(Op, DAG);2815case ISD::CopyToReg:2816return LowerCopyToReg_128(Op, DAG);2817default:2818llvm_unreachable("Custom lowering not defined for operation");2819}2820}28212822// This function is almost a copy of SelectionDAG::expandVAArg().2823// The only diff is that this one produces loads from local address space.2824SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {2825const TargetLowering *TLI = STI.getTargetLowering();2826SDLoc DL(Op);28272828SDNode *Node = Op.getNode();2829const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();2830EVT VT = Node->getValueType(0);2831auto *Ty = 
VT.getTypeForEVT(*DAG.getContext());2832SDValue Tmp1 = Node->getOperand(0);2833SDValue Tmp2 = Node->getOperand(1);2834const MaybeAlign MA(Node->getConstantOperandVal(3));28352836SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,2837Tmp1, Tmp2, MachinePointerInfo(V));2838SDValue VAList = VAListLoad;28392840if (MA && *MA > TLI->getMinStackArgumentAlignment()) {2841VAList = DAG.getNode(2842ISD::ADD, DL, VAList.getValueType(), VAList,2843DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));28442845VAList = DAG.getNode(2846ISD::AND, DL, VAList.getValueType(), VAList,2847DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));2848}28492850// Increment the pointer, VAList, to the next vaarg2851Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,2852DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),2853DL, VAList.getValueType()));28542855// Store the incremented VAList to the legalized pointer2856Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,2857MachinePointerInfo(V));28582859const Value *SrcV =2860Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));28612862// Load the actual argument out of the pointer VAList2863return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));2864}28652866SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {2867const TargetLowering *TLI = STI.getTargetLowering();2868SDLoc DL(Op);2869EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());28702871// Store the address of unsized array <function>_vararg[] in the ap object.2872SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);2873SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);28742875const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();2876return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),2877MachinePointerInfo(SV));2878}28792880SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {2881SDValue Op0 = Op->getOperand(0);2882SDValue Op1 = Op->getOperand(1);2883SDValue Op2 = Op->getOperand(2);2884SDLoc DL(Op.getNode());28852886assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");28872888Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);2889Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);2890SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);2891SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);28922893return Trunc;2894}28952896SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {2897if (Op.getValueType() == MVT::i1)2898return LowerLOADi1(Op, DAG);28992900// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle2901// unaligned loads and have to handle it here.2902EVT VT = Op.getValueType();2903if (Isv2x16VT(VT) || VT == MVT::v4i8) {2904LoadSDNode *Load = cast<LoadSDNode>(Op);2905EVT MemVT = Load->getMemoryVT();2906if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),2907MemVT, *Load->getMemOperand())) {2908SDValue Ops[2];2909std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);2910return DAG.getMergeValues(Ops, SDLoc(Op));2911}2912}29132914return SDValue();2915}29162917// v = ld i1* addr2918// =>2919// v1 = ld i8* addr (-> i16)2920// v = trunc i16 to i12921SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {2922SDNode *Node = Op.getNode();2923LoadSDNode *LD = cast<LoadSDNode>(Node);2924SDLoc dl(Node);2925assert(LD->getExtensionType() == 
ISD::NON_EXTLOAD);2926assert(Node->getValueType(0) == MVT::i1 &&2927"Custom lowering for i1 load only");2928SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),2929LD->getBasePtr(), LD->getPointerInfo(),2930MVT::i8, LD->getAlign(),2931LD->getMemOperand()->getFlags());2932SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);2933// The legalizer (the caller) is expecting two values from the legalized2934// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()2935// in LegalizeDAG.cpp which also uses MergeValues.2936SDValue Ops[] = { result, LD->getChain() };2937return DAG.getMergeValues(Ops, dl);2938}29392940SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {2941StoreSDNode *Store = cast<StoreSDNode>(Op);2942EVT VT = Store->getMemoryVT();29432944if (VT == MVT::i1)2945return LowerSTOREi1(Op, DAG);29462947// v2f16 is legal, so we can't rely on legalizer to handle unaligned2948// stores and have to handle it here.2949if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&2950!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),2951VT, *Store->getMemOperand()))2952return expandUnalignedStore(Store, DAG);29532954// v2f16, v2bf16 and v2i16 don't need special handling.2955if (Isv2x16VT(VT) || VT == MVT::v4i8)2956return SDValue();29572958if (VT.isVector())2959return LowerSTOREVector(Op, DAG);29602961return SDValue();2962}29632964SDValue2965NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {2966SDNode *N = Op.getNode();2967SDValue Val = N->getOperand(1);2968SDLoc DL(N);2969EVT ValVT = Val.getValueType();29702971if (ValVT.isVector()) {2972// We only handle "native" vector sizes for now, e.g. <4 x double> is not2973// legal. We can (and should) split that into 2 stores of <2 x double> here2974// but I'm leaving that as a TODO for now.2975if (!ValVT.isSimple())2976return SDValue();2977switch (ValVT.getSimpleVT().SimpleTy) {2978default:2979return SDValue();2980case MVT::v2i8:2981case MVT::v2i16:2982case MVT::v2i32:2983case MVT::v2i64:2984case MVT::v2f16:2985case MVT::v2bf16:2986case MVT::v2f32:2987case MVT::v2f64:2988case MVT::v4i8:2989case MVT::v4i16:2990case MVT::v4i32:2991case MVT::v4f16:2992case MVT::v4bf16:2993case MVT::v4f32:2994case MVT::v8f16: // <4 x f16x2>2995case MVT::v8bf16: // <4 x bf16x2>2996case MVT::v8i16: // <4 x i16x2>2997// This is a "native" vector type2998break;2999}30003001MemSDNode *MemSD = cast<MemSDNode>(N);3002const DataLayout &TD = DAG.getDataLayout();30033004Align Alignment = MemSD->getAlign();3005Align PrefAlign =3006TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));3007if (Alignment < PrefAlign) {3008// This store is not sufficiently aligned, so bail out and let this vector3009// store be scalarized. Note that we may still be able to emit smaller3010// vector stores. For example, if we are storing a <4 x float> with an3011// alignment of 8, this check will fail but the legalizer will try again3012// with 2 x <2 x float>, which will succeed with an alignment of 8.3013return SDValue();3014}30153016unsigned Opcode = 0;3017EVT EltVT = ValVT.getVectorElementType();3018unsigned NumElts = ValVT.getVectorNumElements();30193020// Since StoreV2 is a target node, we cannot rely on DAG type legalization.3021// Therefore, we must ensure the type is legal. 
For i1 and i8, we set the3022// stored type to i16 and propagate the "real" type as the memory type.3023bool NeedExt = false;3024if (EltVT.getSizeInBits() < 16)3025NeedExt = true;30263027bool StoreF16x2 = false;3028switch (NumElts) {3029default:3030return SDValue();3031case 2:3032Opcode = NVPTXISD::StoreV2;3033break;3034case 4:3035Opcode = NVPTXISD::StoreV4;3036break;3037case 8:3038// v8f16 is a special case. PTX doesn't have st.v8.f163039// instruction. Instead, we split the vector into v2f16 chunks and3040// store them with st.v4.b32.3041assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");3042Opcode = NVPTXISD::StoreV4;3043StoreF16x2 = true;3044break;3045}30463047SmallVector<SDValue, 8> Ops;30483049// First is the chain3050Ops.push_back(N->getOperand(0));30513052if (StoreF16x2) {3053// Combine f16,f16 -> v2f163054NumElts /= 2;3055for (unsigned i = 0; i < NumElts; ++i) {3056SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,3057DAG.getIntPtrConstant(i * 2, DL));3058SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,3059DAG.getIntPtrConstant(i * 2 + 1, DL));3060EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);3061SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);3062Ops.push_back(V2);3063}3064} else {3065// Then the split values3066for (unsigned i = 0; i < NumElts; ++i) {3067SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,3068DAG.getIntPtrConstant(i, DL));3069if (NeedExt)3070ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);3071Ops.push_back(ExtVal);3072}3073}30743075// Then any remaining arguments3076Ops.append(N->op_begin() + 2, N->op_end());30773078SDValue NewSt =3079DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,3080MemSD->getMemoryVT(), MemSD->getMemOperand());30813082// return DCI.CombineTo(N, NewSt, true);3083return NewSt;3084}30853086return SDValue();3087}30883089// st i1 v, addr3090// =>3091// v1 = zxt v to i163092// st.u8 i16, addr3093SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {3094SDNode *Node = Op.getNode();3095SDLoc dl(Node);3096StoreSDNode *ST = cast<StoreSDNode>(Node);3097SDValue Tmp1 = ST->getChain();3098SDValue Tmp2 = ST->getBasePtr();3099SDValue Tmp3 = ST->getValue();3100assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");3101Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);3102SDValue Result =3103DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,3104ST->getAlign(), ST->getMemOperand()->getFlags());3105return Result;3106}31073108SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,3109SelectionDAG &DAG) const {3110// Change the CopyToReg to take in two 64-bit operands instead of a 128-bit3111// operand so that it can pass the legalization.31123113assert(Op.getOperand(1).getValueType() == MVT::i128 &&3114"Custom lowering for 128-bit CopyToReg only");31153116SDNode *Node = Op.getNode();3117SDLoc DL(Node);31183119SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));3120SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,3121DAG.getIntPtrConstant(0, DL));3122SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,3123DAG.getIntPtrConstant(1, DL));31243125SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);3126SmallVector<EVT, 3> ResultsType(Node->values());31273128NewOps[0] = Op->getOperand(0); // Chain3129NewOps[1] = Op->getOperand(1); // Dst Reg3130NewOps[2] = Lo; // Lower 64-bit3131NewOps[3] = Hi; // Higher 
64-bit3132if (Op.getNumOperands() == 4)3133NewOps[4] = Op->getOperand(3); // Glue if exists31343135return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);3136}31373138unsigned NVPTXTargetLowering::getNumRegisters(3139LLVMContext &Context, EVT VT,3140std::optional<MVT> RegisterVT = std::nullopt) const {3141if (VT == MVT::i128 && RegisterVT == MVT::i128)3142return 1;3143return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);3144}31453146bool NVPTXTargetLowering::splitValueIntoRegisterParts(3147SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,3148unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {3149if (Val.getValueType() == MVT::i128 && NumParts == 1) {3150Parts[0] = Val;3151return true;3152}3153return false;3154}31553156// This creates target external symbol for a function parameter.3157// Name of the symbol is composed from its index and the function name.3158// Negative index corresponds to special parameter (unsized array) used for3159// passing variable arguments.3160SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,3161EVT v) const {3162StringRef SavedStr = nvTM->getStrPool().save(3163getParamName(&DAG.getMachineFunction().getFunction(), idx));3164return DAG.getTargetExternalSymbol(SavedStr.data(), v);3165}31663167SDValue NVPTXTargetLowering::LowerFormalArguments(3168SDValue Chain, CallingConv::ID CallConv, bool isVarArg,3169const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,3170SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {3171MachineFunction &MF = DAG.getMachineFunction();3172const DataLayout &DL = DAG.getDataLayout();3173auto PtrVT = getPointerTy(DAG.getDataLayout());31743175const Function *F = &MF.getFunction();3176const AttributeList &PAL = F->getAttributes();3177const TargetLowering *TLI = STI.getTargetLowering();31783179SDValue Root = DAG.getRoot();3180std::vector<SDValue> OutChains;31813182bool isABI = (STI.getSmVersion() >= 20);3183assert(isABI && "Non-ABI compilation is not supported");3184if (!isABI)3185return Chain;31863187std::vector<Type *> argTypes;3188std::vector<const Argument *> theArgs;3189for (const Argument &I : F->args()) {3190theArgs.push_back(&I);3191argTypes.push_back(I.getType());3192}3193// argTypes.size() (or theArgs.size()) and Ins.size() need not match.3194// Ins.size() will be larger3195// * if there is an aggregate argument with multiple fields (each field3196// showing up separately in Ins)3197// * if there is a vector argument with more than typical vector-length3198// elements (generally if more than 4) where each vector element is3199// individually present in Ins.3200// So a different index should be used for indexing into Ins.3201// See similar issue in LowerCall.3202unsigned InsIdx = 0;32033204for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {3205Type *Ty = argTypes[i];32063207if (theArgs[i]->use_empty()) {3208// argument is dead3209if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {3210SmallVector<EVT, 16> vtparts;32113212ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);3213if (vtparts.empty())3214report_fatal_error("Empty parameter types are not supported");32153216for (unsigned parti = 0, parte = vtparts.size(); parti != parte;3217++parti) {3218InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));3219++InsIdx;3220}3221if (vtparts.size() > 0)3222--InsIdx;3223continue;3224}3225if (Ty->isVectorTy()) {3226EVT ObjectVT = getValueType(DL, Ty);3227unsigned NumRegs = TLI->getNumRegisters(F->getContext(), 
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }

    // In the following cases, assign a node order of "i+1"
    // to newly created nodes. The SDNodes for params have to
    // appear in the same order as their order of appearance
    // in the original function. "i+1" holds that order.
    if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
      bool aggregateIsPacked = false;
      if (StructType *STy = dyn_cast<StructType>(Ty))
        aggregateIsPacked = STy->isPacked();

      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
      if (VTs.empty())
        report_fatal_error("Empty parameter types are not supported");

      Align ArgAlign = getFunctionArgumentAlignment(
          F, Ty, i + AttributeList::FirstArgIndex, DL);
      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);

      SDValue Arg = getParamSymbol(DAG, i, PtrVT);
      int VecIdx = -1; // Index of the first element of the current vector.
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
        if (VectorInfo[parti] & PVF_FIRST) {
          assert(VecIdx == -1 && "Orphaned vector.");
          VecIdx = parti;
        }

        // That's the last element of this store op.
        if (VectorInfo[parti] & PVF_LAST) {
          unsigned NumElts = parti - VecIdx + 1;
          EVT EltVT = VTs[parti];
          // i1 is loaded/stored as i8.
          EVT LoadVT = EltVT;
          if (EltVT == MVT::i1)
            LoadVT = MVT::i8;
          else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
            // getLoad needs a vector type, but it can't handle
            // vectors which contain v2f16 or v2bf16 elements. So we must load
            // using i32 here and then bitcast back.
            LoadVT = MVT::i32;

          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
          SDValue VecAddr =
              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
          Value *srcValue = Constant::getNullValue(PointerType::get(
              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));

          const MaybeAlign PartAlign = [&]() -> MaybeAlign {
            if (aggregateIsPacked)
              return Align(1);
            if (NumElts != 1)
              return std::nullopt;
            Align PartAlign =
                DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
            return commonAlignment(PartAlign, Offsets[parti]);
          }();
          SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
                                  MachinePointerInfo(srcValue), PartAlign,
                                  MachineMemOperand::MODereferenceable |
                                      MachineMemOperand::MOInvariant);
          if (P.getNode())
            P.getNode()->setIROrder(i + 1);
          for (unsigned j = 0; j < NumElts; ++j) {
            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
                                      DAG.getIntPtrConstant(j, dl));
            // We've loaded i1 as an i8 and now must truncate it back to i1
            if (EltVT == MVT::i1)
              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
            // v2f16 was loaded as an i32. Now we must bitcast it back.
            else if (EltVT != LoadVT)
              Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);

            // If a promoted integer type is used, truncate down to the original
            MVT PromotedVT;
            if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
            }
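            // Note: Elt is now back in its declared element type (EltVT); the
            // step below widens integer elements whose expected register type
            // (Ins[InsIdx].VT) is wider than the loaded type.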
            // Extend the element if necessary (e.g. an i8 is loaded
            // into an i16 register)
            if (Ins[InsIdx].VT.isInteger() &&
                Ins[InsIdx].VT.getFixedSizeInBits() >
                    LoadVT.getFixedSizeInBits()) {
              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                           : ISD::ZERO_EXTEND;
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
            }
            InVals.push_back(Elt);
          }

          // Reset vector tracking state.
          VecIdx = -1;
        }
        ++InsIdx;
      }
      if (VTs.size() > 0)
        --InsIdx;
      continue;
    }

    // Param has ByVal attribute
    // Return MoveParam(param symbol).
    // Ideally, the param symbol can be returned directly,
    // but when the SDNode builder decides to use it in a CopyToReg(),
    // the machine instruction fails because TargetExternalSymbol
    // (not lowered) is target dependent, and CopyToReg assumes
    // the source is lowered.
    EVT ObjectVT = getValueType(DL, Ty);
    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
    SDValue Arg = getParamSymbol(DAG, i, PtrVT);
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
    if (p.getNode())
      p.getNode()->setIROrder(i + 1);
    InVals.push_back(p);
  }

  if (!OutChains.empty())
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));

  return Chain;
}

// Use byte-store when the param address of the return value is unaligned.
// This may happen when the return value is a field of a packed structure.
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
                                      uint64_t Offset, EVT ElementType,
                                      SDValue RetVal, const SDLoc &dl) {
  // Bit logic only works on integer types
  if (adjustElementType(ElementType))
    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);

  // Store each byte
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
                                   DAG.getConstant(i * 8, dl, MVT::i32));
    SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
                               ShiftVal};
    // Trunc store only the last byte by using
    //     st.param.b8
    // The register type can be larger than b8.
    Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                    DAG.getVTList(MVT::Other), StoreOperands,
                                    MVT::i8, MachinePointerInfo(), std::nullopt,
                                    MachineMemOperand::MOStore);
  }
  return Chain;
}

SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &dl, SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();
  Type *RetTy = MF.getFunction().getReturnType();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  const DataLayout &DL = DAG.getDataLayout();
  SmallVector<SDValue, 16> PromotedOutVals;
  SmallVector<EVT, 16> VTs;
  SmallVector<uint64_t, 16> Offsets;
  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue PromotedOutVal = OutVals[i];
    MVT PromotedVT;
    if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
      VTs[i] = EVT(PromotedVT);
    }
    if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
      llvm::ISD::NodeType Ext
          = Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
    }
    PromotedOutVals.push_back(PromotedOutVal);
  }

  auto VectorInfo = VectorizePTXValueVTs(
      VTs, Offsets,
      RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
                       : Align(1));

  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32-bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
  bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  SmallVector<SDValue, 6> StoreOperands;
  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue OutVal = OutVals[i];
    SDValue RetVal = PromotedOutVals[i];

    if (ExtendIntegerRetVal) {
      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                  : ISD::ZERO_EXTEND,
                           dl, MVT::i32, RetVal);
    } else if (OutVal.getValueSizeInBits() < 16) {
      // Use 16-bit registers for small load-stores as it's the
      // smallest general purpose register size supported by NVPTX.
      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
    }

    // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
    // for a scalar store. In such cases, fall back to byte stores.
    if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
      EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Align ElementTypeAlign =
          DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
      Align ElementAlign =
          commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
      if (ElementAlign < ElementTypeAlign) {
        assert(StoreOperands.empty() && "Orphaned operand list.");
        Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
                                       RetVal, dl);

        // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
        // into the graph, so just move on to the next element.
        continue;
      }
    }

    // New load/store. Record chain and offset operands.
    if (VectorInfo[i] & PVF_FIRST) {
      assert(StoreOperands.empty() && "Orphaned operand list.");
      StoreOperands.push_back(Chain);
      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
    }

    // Record the value to return.
    StoreOperands.push_back(RetVal);

    // That's the last element of this store op.
    if (VectorInfo[i] & PVF_LAST) {
      NVPTXISD::NodeType Op;
      unsigned NumElts = StoreOperands.size() - 2;
      switch (NumElts) {
      case 1:
        Op = NVPTXISD::StoreRetval;
        break;
      case 2:
        Op = NVPTXISD::StoreRetvalV2;
        break;
      case 4:
        Op = NVPTXISD::StoreRetvalV4;
        break;
      default:
        llvm_unreachable("Invalid vector info.");
      }
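      // StoreOperands currently holds {Chain, offset, value...}, so NumElts
      // above counts only the values carried by this store; PTX provides
      // scalar, v2, and v4 st.param forms, hence the 1/2/4 cases.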
      // Adjust type of load/store op if we've extended the scalar
      // return value.
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Chain = DAG.getMemIntrinsicNode(
          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
          MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
      // Cleanup vector state.
      StoreOperands.clear();
    }
  }

  return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
}

void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.size() > 1)
    return;
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
  switch (Intrinsic) {
  default:
    return 0;

  case Intrinsic::nvvm_tex_1d_v4f32_s32:
    return NVPTXISD::Tex1DFloatS32;
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
    return NVPTXISD::Tex1DFloatFloat;
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
    return NVPTXISD::Tex1DFloatFloatLevel;
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
    return NVPTXISD::Tex1DFloatFloatGrad;
  case Intrinsic::nvvm_tex_1d_v4s32_s32:
    return NVPTXISD::Tex1DS32S32;
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
    return NVPTXISD::Tex1DS32Float;
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
    return NVPTXISD::Tex1DS32FloatLevel;
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
    return NVPTXISD::Tex1DS32FloatGrad;
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
    return NVPTXISD::Tex1DU32S32;
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
    return NVPTXISD::Tex1DU32Float;
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
    return NVPTXISD::Tex1DU32FloatLevel;
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
    return NVPTXISD::Tex1DU32FloatGrad;

  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
    return NVPTXISD::Tex1DArrayFloatS32;
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
    return NVPTXISD::Tex1DArrayFloatFloat;
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
    return NVPTXISD::Tex1DArrayFloatFloatLevel;
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
    return NVPTXISD::Tex1DArrayFloatFloatGrad;
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
    return NVPTXISD::Tex1DArrayS32S32;
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
    return NVPTXISD::Tex1DArrayS32Float;
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
    return NVPTXISD::Tex1DArrayS32FloatLevel;
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
    return NVPTXISD::Tex1DArrayS32FloatGrad;
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
    return NVPTXISD::Tex1DArrayU32S32;
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
    return NVPTXISD::Tex1DArrayU32Float;
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
    return NVPTXISD::Tex1DArrayU32FloatLevel;
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
    return NVPTXISD::Tex1DArrayU32FloatGrad;

  case Intrinsic::nvvm_tex_2d_v4f32_s32:
    return NVPTXISD::Tex2DFloatS32;
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
    return NVPTXISD::Tex2DFloatFloat;
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
    return NVPTXISD::Tex2DFloatFloatLevel;
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
    return NVPTXISD::Tex2DFloatFloatGrad;
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
    return NVPTXISD::Tex2DS32S32;
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
    return NVPTXISD::Tex2DS32Float;
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
    return NVPTXISD::Tex2DS32FloatLevel;
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
    return NVPTXISD::Tex2DS32FloatGrad;
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
    return
NVPTXISD::Tex2DU32S32;3595case Intrinsic::nvvm_tex_2d_v4u32_f32:3596return NVPTXISD::Tex2DU32Float;3597case Intrinsic::nvvm_tex_2d_level_v4u32_f32:3598return NVPTXISD::Tex2DU32FloatLevel;3599case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:3600return NVPTXISD::Tex2DU32FloatGrad;36013602case Intrinsic::nvvm_tex_2d_array_v4f32_s32:3603return NVPTXISD::Tex2DArrayFloatS32;3604case Intrinsic::nvvm_tex_2d_array_v4f32_f32:3605return NVPTXISD::Tex2DArrayFloatFloat;3606case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:3607return NVPTXISD::Tex2DArrayFloatFloatLevel;3608case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:3609return NVPTXISD::Tex2DArrayFloatFloatGrad;3610case Intrinsic::nvvm_tex_2d_array_v4s32_s32:3611return NVPTXISD::Tex2DArrayS32S32;3612case Intrinsic::nvvm_tex_2d_array_v4s32_f32:3613return NVPTXISD::Tex2DArrayS32Float;3614case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:3615return NVPTXISD::Tex2DArrayS32FloatLevel;3616case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:3617return NVPTXISD::Tex2DArrayS32FloatGrad;3618case Intrinsic::nvvm_tex_2d_array_v4u32_s32:3619return NVPTXISD::Tex2DArrayU32S32;3620case Intrinsic::nvvm_tex_2d_array_v4u32_f32:3621return NVPTXISD::Tex2DArrayU32Float;3622case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:3623return NVPTXISD::Tex2DArrayU32FloatLevel;3624case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:3625return NVPTXISD::Tex2DArrayU32FloatGrad;36263627case Intrinsic::nvvm_tex_3d_v4f32_s32:3628return NVPTXISD::Tex3DFloatS32;3629case Intrinsic::nvvm_tex_3d_v4f32_f32:3630return NVPTXISD::Tex3DFloatFloat;3631case Intrinsic::nvvm_tex_3d_level_v4f32_f32:3632return NVPTXISD::Tex3DFloatFloatLevel;3633case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:3634return NVPTXISD::Tex3DFloatFloatGrad;3635case Intrinsic::nvvm_tex_3d_v4s32_s32:3636return NVPTXISD::Tex3DS32S32;3637case Intrinsic::nvvm_tex_3d_v4s32_f32:3638return NVPTXISD::Tex3DS32Float;3639case Intrinsic::nvvm_tex_3d_level_v4s32_f32:3640return NVPTXISD::Tex3DS32FloatLevel;3641case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:3642return NVPTXISD::Tex3DS32FloatGrad;3643case Intrinsic::nvvm_tex_3d_v4u32_s32:3644return NVPTXISD::Tex3DU32S32;3645case Intrinsic::nvvm_tex_3d_v4u32_f32:3646return NVPTXISD::Tex3DU32Float;3647case Intrinsic::nvvm_tex_3d_level_v4u32_f32:3648return NVPTXISD::Tex3DU32FloatLevel;3649case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:3650return NVPTXISD::Tex3DU32FloatGrad;36513652case Intrinsic::nvvm_tex_cube_v4f32_f32:3653return NVPTXISD::TexCubeFloatFloat;3654case Intrinsic::nvvm_tex_cube_level_v4f32_f32:3655return NVPTXISD::TexCubeFloatFloatLevel;3656case Intrinsic::nvvm_tex_cube_v4s32_f32:3657return NVPTXISD::TexCubeS32Float;3658case Intrinsic::nvvm_tex_cube_level_v4s32_f32:3659return NVPTXISD::TexCubeS32FloatLevel;3660case Intrinsic::nvvm_tex_cube_v4u32_f32:3661return NVPTXISD::TexCubeU32Float;3662case Intrinsic::nvvm_tex_cube_level_v4u32_f32:3663return NVPTXISD::TexCubeU32FloatLevel;36643665case Intrinsic::nvvm_tex_cube_array_v4f32_f32:3666return NVPTXISD::TexCubeArrayFloatFloat;3667case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:3668return NVPTXISD::TexCubeArrayFloatFloatLevel;3669case Intrinsic::nvvm_tex_cube_array_v4s32_f32:3670return NVPTXISD::TexCubeArrayS32Float;3671case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:3672return NVPTXISD::TexCubeArrayS32FloatLevel;3673case Intrinsic::nvvm_tex_cube_array_v4u32_f32:3674return NVPTXISD::TexCubeArrayU32Float;3675case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:3676return NVPTXISD::TexCubeArrayU32FloatLevel;36773678case 
Intrinsic::nvvm_tld4_r_2d_v4f32_f32:3679return NVPTXISD::Tld4R2DFloatFloat;3680case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:3681return NVPTXISD::Tld4G2DFloatFloat;3682case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:3683return NVPTXISD::Tld4B2DFloatFloat;3684case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:3685return NVPTXISD::Tld4A2DFloatFloat;3686case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:3687return NVPTXISD::Tld4R2DS64Float;3688case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:3689return NVPTXISD::Tld4G2DS64Float;3690case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:3691return NVPTXISD::Tld4B2DS64Float;3692case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:3693return NVPTXISD::Tld4A2DS64Float;3694case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:3695return NVPTXISD::Tld4R2DU64Float;3696case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:3697return NVPTXISD::Tld4G2DU64Float;3698case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:3699return NVPTXISD::Tld4B2DU64Float;3700case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:3701return NVPTXISD::Tld4A2DU64Float;37023703case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:3704return NVPTXISD::TexUnified1DFloatS32;3705case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:3706return NVPTXISD::TexUnified1DFloatFloat;3707case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:3708return NVPTXISD::TexUnified1DFloatFloatLevel;3709case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:3710return NVPTXISD::TexUnified1DFloatFloatGrad;3711case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:3712return NVPTXISD::TexUnified1DS32S32;3713case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:3714return NVPTXISD::TexUnified1DS32Float;3715case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:3716return NVPTXISD::TexUnified1DS32FloatLevel;3717case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:3718return NVPTXISD::TexUnified1DS32FloatGrad;3719case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:3720return NVPTXISD::TexUnified1DU32S32;3721case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:3722return NVPTXISD::TexUnified1DU32Float;3723case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:3724return NVPTXISD::TexUnified1DU32FloatLevel;3725case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:3726return NVPTXISD::TexUnified1DU32FloatGrad;37273728case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:3729return NVPTXISD::TexUnified1DArrayFloatS32;3730case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:3731return NVPTXISD::TexUnified1DArrayFloatFloat;3732case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:3733return NVPTXISD::TexUnified1DArrayFloatFloatLevel;3734case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:3735return NVPTXISD::TexUnified1DArrayFloatFloatGrad;3736case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:3737return NVPTXISD::TexUnified1DArrayS32S32;3738case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:3739return NVPTXISD::TexUnified1DArrayS32Float;3740case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:3741return NVPTXISD::TexUnified1DArrayS32FloatLevel;3742case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:3743return NVPTXISD::TexUnified1DArrayS32FloatGrad;3744case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:3745return NVPTXISD::TexUnified1DArrayU32S32;3746case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:3747return NVPTXISD::TexUnified1DArrayU32Float;3748case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:3749return NVPTXISD::TexUnified1DArrayU32FloatLevel;3750case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:3751return NVPTXISD::TexUnified1DArrayU32FloatGrad;37523753case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:3754return 
NVPTXISD::TexUnified2DFloatS32;3755case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:3756return NVPTXISD::TexUnified2DFloatFloat;3757case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:3758return NVPTXISD::TexUnified2DFloatFloatLevel;3759case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:3760return NVPTXISD::TexUnified2DFloatFloatGrad;3761case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:3762return NVPTXISD::TexUnified2DS32S32;3763case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:3764return NVPTXISD::TexUnified2DS32Float;3765case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:3766return NVPTXISD::TexUnified2DS32FloatLevel;3767case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:3768return NVPTXISD::TexUnified2DS32FloatGrad;3769case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:3770return NVPTXISD::TexUnified2DU32S32;3771case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:3772return NVPTXISD::TexUnified2DU32Float;3773case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:3774return NVPTXISD::TexUnified2DU32FloatLevel;3775case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:3776return NVPTXISD::TexUnified2DU32FloatGrad;37773778case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:3779return NVPTXISD::TexUnified2DArrayFloatS32;3780case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:3781return NVPTXISD::TexUnified2DArrayFloatFloat;3782case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:3783return NVPTXISD::TexUnified2DArrayFloatFloatLevel;3784case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:3785return NVPTXISD::TexUnified2DArrayFloatFloatGrad;3786case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:3787return NVPTXISD::TexUnified2DArrayS32S32;3788case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:3789return NVPTXISD::TexUnified2DArrayS32Float;3790case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:3791return NVPTXISD::TexUnified2DArrayS32FloatLevel;3792case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:3793return NVPTXISD::TexUnified2DArrayS32FloatGrad;3794case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:3795return NVPTXISD::TexUnified2DArrayU32S32;3796case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:3797return NVPTXISD::TexUnified2DArrayU32Float;3798case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:3799return NVPTXISD::TexUnified2DArrayU32FloatLevel;3800case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:3801return NVPTXISD::TexUnified2DArrayU32FloatGrad;38023803case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:3804return NVPTXISD::TexUnified3DFloatS32;3805case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:3806return NVPTXISD::TexUnified3DFloatFloat;3807case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:3808return NVPTXISD::TexUnified3DFloatFloatLevel;3809case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:3810return NVPTXISD::TexUnified3DFloatFloatGrad;3811case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:3812return NVPTXISD::TexUnified3DS32S32;3813case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:3814return NVPTXISD::TexUnified3DS32Float;3815case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:3816return NVPTXISD::TexUnified3DS32FloatLevel;3817case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:3818return NVPTXISD::TexUnified3DS32FloatGrad;3819case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:3820return NVPTXISD::TexUnified3DU32S32;3821case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:3822return NVPTXISD::TexUnified3DU32Float;3823case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:3824return NVPTXISD::TexUnified3DU32FloatLevel;3825case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:3826return 
NVPTXISD::TexUnified3DU32FloatGrad;38273828case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:3829return NVPTXISD::TexUnifiedCubeFloatFloat;3830case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:3831return NVPTXISD::TexUnifiedCubeFloatFloatLevel;3832case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:3833return NVPTXISD::TexUnifiedCubeS32Float;3834case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:3835return NVPTXISD::TexUnifiedCubeS32FloatLevel;3836case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:3837return NVPTXISD::TexUnifiedCubeU32Float;3838case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:3839return NVPTXISD::TexUnifiedCubeU32FloatLevel;38403841case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:3842return NVPTXISD::TexUnifiedCubeArrayFloatFloat;3843case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:3844return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;3845case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:3846return NVPTXISD::TexUnifiedCubeArrayS32Float;3847case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:3848return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;3849case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:3850return NVPTXISD::TexUnifiedCubeArrayU32Float;3851case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:3852return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;38533854case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:3855return NVPTXISD::TexUnifiedCubeFloatFloatGrad;3856case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:3857return NVPTXISD::TexUnifiedCubeS32FloatGrad;3858case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:3859return NVPTXISD::TexUnifiedCubeU32FloatGrad;3860case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:3861return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;3862case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:3863return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;3864case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:3865return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;38663867case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:3868return NVPTXISD::Tld4UnifiedR2DFloatFloat;3869case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:3870return NVPTXISD::Tld4UnifiedG2DFloatFloat;3871case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:3872return NVPTXISD::Tld4UnifiedB2DFloatFloat;3873case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:3874return NVPTXISD::Tld4UnifiedA2DFloatFloat;3875case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:3876return NVPTXISD::Tld4UnifiedR2DS64Float;3877case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:3878return NVPTXISD::Tld4UnifiedG2DS64Float;3879case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:3880return NVPTXISD::Tld4UnifiedB2DS64Float;3881case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:3882return NVPTXISD::Tld4UnifiedA2DS64Float;3883case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:3884return NVPTXISD::Tld4UnifiedR2DU64Float;3885case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:3886return NVPTXISD::Tld4UnifiedG2DU64Float;3887case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:3888return NVPTXISD::Tld4UnifiedB2DU64Float;3889case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:3890return NVPTXISD::Tld4UnifiedA2DU64Float;3891}3892}38933894static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {3895switch (Intrinsic) {3896default:3897return 0;3898case Intrinsic::nvvm_suld_1d_i8_clamp:3899return NVPTXISD::Suld1DI8Clamp;3900case Intrinsic::nvvm_suld_1d_i16_clamp:3901return NVPTXISD::Suld1DI16Clamp;3902case Intrinsic::nvvm_suld_1d_i32_clamp:3903return NVPTXISD::Suld1DI32Clamp;3904case 
Intrinsic::nvvm_suld_1d_i64_clamp:3905return NVPTXISD::Suld1DI64Clamp;3906case Intrinsic::nvvm_suld_1d_v2i8_clamp:3907return NVPTXISD::Suld1DV2I8Clamp;3908case Intrinsic::nvvm_suld_1d_v2i16_clamp:3909return NVPTXISD::Suld1DV2I16Clamp;3910case Intrinsic::nvvm_suld_1d_v2i32_clamp:3911return NVPTXISD::Suld1DV2I32Clamp;3912case Intrinsic::nvvm_suld_1d_v2i64_clamp:3913return NVPTXISD::Suld1DV2I64Clamp;3914case Intrinsic::nvvm_suld_1d_v4i8_clamp:3915return NVPTXISD::Suld1DV4I8Clamp;3916case Intrinsic::nvvm_suld_1d_v4i16_clamp:3917return NVPTXISD::Suld1DV4I16Clamp;3918case Intrinsic::nvvm_suld_1d_v4i32_clamp:3919return NVPTXISD::Suld1DV4I32Clamp;3920case Intrinsic::nvvm_suld_1d_array_i8_clamp:3921return NVPTXISD::Suld1DArrayI8Clamp;3922case Intrinsic::nvvm_suld_1d_array_i16_clamp:3923return NVPTXISD::Suld1DArrayI16Clamp;3924case Intrinsic::nvvm_suld_1d_array_i32_clamp:3925return NVPTXISD::Suld1DArrayI32Clamp;3926case Intrinsic::nvvm_suld_1d_array_i64_clamp:3927return NVPTXISD::Suld1DArrayI64Clamp;3928case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:3929return NVPTXISD::Suld1DArrayV2I8Clamp;3930case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:3931return NVPTXISD::Suld1DArrayV2I16Clamp;3932case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:3933return NVPTXISD::Suld1DArrayV2I32Clamp;3934case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:3935return NVPTXISD::Suld1DArrayV2I64Clamp;3936case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:3937return NVPTXISD::Suld1DArrayV4I8Clamp;3938case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:3939return NVPTXISD::Suld1DArrayV4I16Clamp;3940case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:3941return NVPTXISD::Suld1DArrayV4I32Clamp;3942case Intrinsic::nvvm_suld_2d_i8_clamp:3943return NVPTXISD::Suld2DI8Clamp;3944case Intrinsic::nvvm_suld_2d_i16_clamp:3945return NVPTXISD::Suld2DI16Clamp;3946case Intrinsic::nvvm_suld_2d_i32_clamp:3947return NVPTXISD::Suld2DI32Clamp;3948case Intrinsic::nvvm_suld_2d_i64_clamp:3949return NVPTXISD::Suld2DI64Clamp;3950case Intrinsic::nvvm_suld_2d_v2i8_clamp:3951return NVPTXISD::Suld2DV2I8Clamp;3952case Intrinsic::nvvm_suld_2d_v2i16_clamp:3953return NVPTXISD::Suld2DV2I16Clamp;3954case Intrinsic::nvvm_suld_2d_v2i32_clamp:3955return NVPTXISD::Suld2DV2I32Clamp;3956case Intrinsic::nvvm_suld_2d_v2i64_clamp:3957return NVPTXISD::Suld2DV2I64Clamp;3958case Intrinsic::nvvm_suld_2d_v4i8_clamp:3959return NVPTXISD::Suld2DV4I8Clamp;3960case Intrinsic::nvvm_suld_2d_v4i16_clamp:3961return NVPTXISD::Suld2DV4I16Clamp;3962case Intrinsic::nvvm_suld_2d_v4i32_clamp:3963return NVPTXISD::Suld2DV4I32Clamp;3964case Intrinsic::nvvm_suld_2d_array_i8_clamp:3965return NVPTXISD::Suld2DArrayI8Clamp;3966case Intrinsic::nvvm_suld_2d_array_i16_clamp:3967return NVPTXISD::Suld2DArrayI16Clamp;3968case Intrinsic::nvvm_suld_2d_array_i32_clamp:3969return NVPTXISD::Suld2DArrayI32Clamp;3970case Intrinsic::nvvm_suld_2d_array_i64_clamp:3971return NVPTXISD::Suld2DArrayI64Clamp;3972case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:3973return NVPTXISD::Suld2DArrayV2I8Clamp;3974case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:3975return NVPTXISD::Suld2DArrayV2I16Clamp;3976case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:3977return NVPTXISD::Suld2DArrayV2I32Clamp;3978case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:3979return NVPTXISD::Suld2DArrayV2I64Clamp;3980case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:3981return NVPTXISD::Suld2DArrayV4I8Clamp;3982case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:3983return NVPTXISD::Suld2DArrayV4I16Clamp;3984case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:3985return 
NVPTXISD::Suld2DArrayV4I32Clamp;3986case Intrinsic::nvvm_suld_3d_i8_clamp:3987return NVPTXISD::Suld3DI8Clamp;3988case Intrinsic::nvvm_suld_3d_i16_clamp:3989return NVPTXISD::Suld3DI16Clamp;3990case Intrinsic::nvvm_suld_3d_i32_clamp:3991return NVPTXISD::Suld3DI32Clamp;3992case Intrinsic::nvvm_suld_3d_i64_clamp:3993return NVPTXISD::Suld3DI64Clamp;3994case Intrinsic::nvvm_suld_3d_v2i8_clamp:3995return NVPTXISD::Suld3DV2I8Clamp;3996case Intrinsic::nvvm_suld_3d_v2i16_clamp:3997return NVPTXISD::Suld3DV2I16Clamp;3998case Intrinsic::nvvm_suld_3d_v2i32_clamp:3999return NVPTXISD::Suld3DV2I32Clamp;4000case Intrinsic::nvvm_suld_3d_v2i64_clamp:4001return NVPTXISD::Suld3DV2I64Clamp;4002case Intrinsic::nvvm_suld_3d_v4i8_clamp:4003return NVPTXISD::Suld3DV4I8Clamp;4004case Intrinsic::nvvm_suld_3d_v4i16_clamp:4005return NVPTXISD::Suld3DV4I16Clamp;4006case Intrinsic::nvvm_suld_3d_v4i32_clamp:4007return NVPTXISD::Suld3DV4I32Clamp;4008case Intrinsic::nvvm_suld_1d_i8_trap:4009return NVPTXISD::Suld1DI8Trap;4010case Intrinsic::nvvm_suld_1d_i16_trap:4011return NVPTXISD::Suld1DI16Trap;4012case Intrinsic::nvvm_suld_1d_i32_trap:4013return NVPTXISD::Suld1DI32Trap;4014case Intrinsic::nvvm_suld_1d_i64_trap:4015return NVPTXISD::Suld1DI64Trap;4016case Intrinsic::nvvm_suld_1d_v2i8_trap:4017return NVPTXISD::Suld1DV2I8Trap;4018case Intrinsic::nvvm_suld_1d_v2i16_trap:4019return NVPTXISD::Suld1DV2I16Trap;4020case Intrinsic::nvvm_suld_1d_v2i32_trap:4021return NVPTXISD::Suld1DV2I32Trap;4022case Intrinsic::nvvm_suld_1d_v2i64_trap:4023return NVPTXISD::Suld1DV2I64Trap;4024case Intrinsic::nvvm_suld_1d_v4i8_trap:4025return NVPTXISD::Suld1DV4I8Trap;4026case Intrinsic::nvvm_suld_1d_v4i16_trap:4027return NVPTXISD::Suld1DV4I16Trap;4028case Intrinsic::nvvm_suld_1d_v4i32_trap:4029return NVPTXISD::Suld1DV4I32Trap;4030case Intrinsic::nvvm_suld_1d_array_i8_trap:4031return NVPTXISD::Suld1DArrayI8Trap;4032case Intrinsic::nvvm_suld_1d_array_i16_trap:4033return NVPTXISD::Suld1DArrayI16Trap;4034case Intrinsic::nvvm_suld_1d_array_i32_trap:4035return NVPTXISD::Suld1DArrayI32Trap;4036case Intrinsic::nvvm_suld_1d_array_i64_trap:4037return NVPTXISD::Suld1DArrayI64Trap;4038case Intrinsic::nvvm_suld_1d_array_v2i8_trap:4039return NVPTXISD::Suld1DArrayV2I8Trap;4040case Intrinsic::nvvm_suld_1d_array_v2i16_trap:4041return NVPTXISD::Suld1DArrayV2I16Trap;4042case Intrinsic::nvvm_suld_1d_array_v2i32_trap:4043return NVPTXISD::Suld1DArrayV2I32Trap;4044case Intrinsic::nvvm_suld_1d_array_v2i64_trap:4045return NVPTXISD::Suld1DArrayV2I64Trap;4046case Intrinsic::nvvm_suld_1d_array_v4i8_trap:4047return NVPTXISD::Suld1DArrayV4I8Trap;4048case Intrinsic::nvvm_suld_1d_array_v4i16_trap:4049return NVPTXISD::Suld1DArrayV4I16Trap;4050case Intrinsic::nvvm_suld_1d_array_v4i32_trap:4051return NVPTXISD::Suld1DArrayV4I32Trap;4052case Intrinsic::nvvm_suld_2d_i8_trap:4053return NVPTXISD::Suld2DI8Trap;4054case Intrinsic::nvvm_suld_2d_i16_trap:4055return NVPTXISD::Suld2DI16Trap;4056case Intrinsic::nvvm_suld_2d_i32_trap:4057return NVPTXISD::Suld2DI32Trap;4058case Intrinsic::nvvm_suld_2d_i64_trap:4059return NVPTXISD::Suld2DI64Trap;4060case Intrinsic::nvvm_suld_2d_v2i8_trap:4061return NVPTXISD::Suld2DV2I8Trap;4062case Intrinsic::nvvm_suld_2d_v2i16_trap:4063return NVPTXISD::Suld2DV2I16Trap;4064case Intrinsic::nvvm_suld_2d_v2i32_trap:4065return NVPTXISD::Suld2DV2I32Trap;4066case Intrinsic::nvvm_suld_2d_v2i64_trap:4067return NVPTXISD::Suld2DV2I64Trap;4068case Intrinsic::nvvm_suld_2d_v4i8_trap:4069return NVPTXISD::Suld2DV4I8Trap;4070case Intrinsic::nvvm_suld_2d_v4i16_trap:4071return 
NVPTXISD::Suld2DV4I16Trap;4072case Intrinsic::nvvm_suld_2d_v4i32_trap:4073return NVPTXISD::Suld2DV4I32Trap;4074case Intrinsic::nvvm_suld_2d_array_i8_trap:4075return NVPTXISD::Suld2DArrayI8Trap;4076case Intrinsic::nvvm_suld_2d_array_i16_trap:4077return NVPTXISD::Suld2DArrayI16Trap;4078case Intrinsic::nvvm_suld_2d_array_i32_trap:4079return NVPTXISD::Suld2DArrayI32Trap;4080case Intrinsic::nvvm_suld_2d_array_i64_trap:4081return NVPTXISD::Suld2DArrayI64Trap;4082case Intrinsic::nvvm_suld_2d_array_v2i8_trap:4083return NVPTXISD::Suld2DArrayV2I8Trap;4084case Intrinsic::nvvm_suld_2d_array_v2i16_trap:4085return NVPTXISD::Suld2DArrayV2I16Trap;4086case Intrinsic::nvvm_suld_2d_array_v2i32_trap:4087return NVPTXISD::Suld2DArrayV2I32Trap;4088case Intrinsic::nvvm_suld_2d_array_v2i64_trap:4089return NVPTXISD::Suld2DArrayV2I64Trap;4090case Intrinsic::nvvm_suld_2d_array_v4i8_trap:4091return NVPTXISD::Suld2DArrayV4I8Trap;4092case Intrinsic::nvvm_suld_2d_array_v4i16_trap:4093return NVPTXISD::Suld2DArrayV4I16Trap;4094case Intrinsic::nvvm_suld_2d_array_v4i32_trap:4095return NVPTXISD::Suld2DArrayV4I32Trap;4096case Intrinsic::nvvm_suld_3d_i8_trap:4097return NVPTXISD::Suld3DI8Trap;4098case Intrinsic::nvvm_suld_3d_i16_trap:4099return NVPTXISD::Suld3DI16Trap;4100case Intrinsic::nvvm_suld_3d_i32_trap:4101return NVPTXISD::Suld3DI32Trap;4102case Intrinsic::nvvm_suld_3d_i64_trap:4103return NVPTXISD::Suld3DI64Trap;4104case Intrinsic::nvvm_suld_3d_v2i8_trap:4105return NVPTXISD::Suld3DV2I8Trap;4106case Intrinsic::nvvm_suld_3d_v2i16_trap:4107return NVPTXISD::Suld3DV2I16Trap;4108case Intrinsic::nvvm_suld_3d_v2i32_trap:4109return NVPTXISD::Suld3DV2I32Trap;4110case Intrinsic::nvvm_suld_3d_v2i64_trap:4111return NVPTXISD::Suld3DV2I64Trap;4112case Intrinsic::nvvm_suld_3d_v4i8_trap:4113return NVPTXISD::Suld3DV4I8Trap;4114case Intrinsic::nvvm_suld_3d_v4i16_trap:4115return NVPTXISD::Suld3DV4I16Trap;4116case Intrinsic::nvvm_suld_3d_v4i32_trap:4117return NVPTXISD::Suld3DV4I32Trap;4118case Intrinsic::nvvm_suld_1d_i8_zero:4119return NVPTXISD::Suld1DI8Zero;4120case Intrinsic::nvvm_suld_1d_i16_zero:4121return NVPTXISD::Suld1DI16Zero;4122case Intrinsic::nvvm_suld_1d_i32_zero:4123return NVPTXISD::Suld1DI32Zero;4124case Intrinsic::nvvm_suld_1d_i64_zero:4125return NVPTXISD::Suld1DI64Zero;4126case Intrinsic::nvvm_suld_1d_v2i8_zero:4127return NVPTXISD::Suld1DV2I8Zero;4128case Intrinsic::nvvm_suld_1d_v2i16_zero:4129return NVPTXISD::Suld1DV2I16Zero;4130case Intrinsic::nvvm_suld_1d_v2i32_zero:4131return NVPTXISD::Suld1DV2I32Zero;4132case Intrinsic::nvvm_suld_1d_v2i64_zero:4133return NVPTXISD::Suld1DV2I64Zero;4134case Intrinsic::nvvm_suld_1d_v4i8_zero:4135return NVPTXISD::Suld1DV4I8Zero;4136case Intrinsic::nvvm_suld_1d_v4i16_zero:4137return NVPTXISD::Suld1DV4I16Zero;4138case Intrinsic::nvvm_suld_1d_v4i32_zero:4139return NVPTXISD::Suld1DV4I32Zero;4140case Intrinsic::nvvm_suld_1d_array_i8_zero:4141return NVPTXISD::Suld1DArrayI8Zero;4142case Intrinsic::nvvm_suld_1d_array_i16_zero:4143return NVPTXISD::Suld1DArrayI16Zero;4144case Intrinsic::nvvm_suld_1d_array_i32_zero:4145return NVPTXISD::Suld1DArrayI32Zero;4146case Intrinsic::nvvm_suld_1d_array_i64_zero:4147return NVPTXISD::Suld1DArrayI64Zero;4148case Intrinsic::nvvm_suld_1d_array_v2i8_zero:4149return NVPTXISD::Suld1DArrayV2I8Zero;4150case Intrinsic::nvvm_suld_1d_array_v2i16_zero:4151return NVPTXISD::Suld1DArrayV2I16Zero;4152case Intrinsic::nvvm_suld_1d_array_v2i32_zero:4153return NVPTXISD::Suld1DArrayV2I32Zero;4154case Intrinsic::nvvm_suld_1d_array_v2i64_zero:4155return 
NVPTXISD::Suld1DArrayV2I64Zero;4156case Intrinsic::nvvm_suld_1d_array_v4i8_zero:4157return NVPTXISD::Suld1DArrayV4I8Zero;4158case Intrinsic::nvvm_suld_1d_array_v4i16_zero:4159return NVPTXISD::Suld1DArrayV4I16Zero;4160case Intrinsic::nvvm_suld_1d_array_v4i32_zero:4161return NVPTXISD::Suld1DArrayV4I32Zero;4162case Intrinsic::nvvm_suld_2d_i8_zero:4163return NVPTXISD::Suld2DI8Zero;4164case Intrinsic::nvvm_suld_2d_i16_zero:4165return NVPTXISD::Suld2DI16Zero;4166case Intrinsic::nvvm_suld_2d_i32_zero:4167return NVPTXISD::Suld2DI32Zero;4168case Intrinsic::nvvm_suld_2d_i64_zero:4169return NVPTXISD::Suld2DI64Zero;4170case Intrinsic::nvvm_suld_2d_v2i8_zero:4171return NVPTXISD::Suld2DV2I8Zero;4172case Intrinsic::nvvm_suld_2d_v2i16_zero:4173return NVPTXISD::Suld2DV2I16Zero;4174case Intrinsic::nvvm_suld_2d_v2i32_zero:4175return NVPTXISD::Suld2DV2I32Zero;4176case Intrinsic::nvvm_suld_2d_v2i64_zero:4177return NVPTXISD::Suld2DV2I64Zero;4178case Intrinsic::nvvm_suld_2d_v4i8_zero:4179return NVPTXISD::Suld2DV4I8Zero;4180case Intrinsic::nvvm_suld_2d_v4i16_zero:4181return NVPTXISD::Suld2DV4I16Zero;4182case Intrinsic::nvvm_suld_2d_v4i32_zero:4183return NVPTXISD::Suld2DV4I32Zero;4184case Intrinsic::nvvm_suld_2d_array_i8_zero:4185return NVPTXISD::Suld2DArrayI8Zero;4186case Intrinsic::nvvm_suld_2d_array_i16_zero:4187return NVPTXISD::Suld2DArrayI16Zero;4188case Intrinsic::nvvm_suld_2d_array_i32_zero:4189return NVPTXISD::Suld2DArrayI32Zero;4190case Intrinsic::nvvm_suld_2d_array_i64_zero:4191return NVPTXISD::Suld2DArrayI64Zero;4192case Intrinsic::nvvm_suld_2d_array_v2i8_zero:4193return NVPTXISD::Suld2DArrayV2I8Zero;4194case Intrinsic::nvvm_suld_2d_array_v2i16_zero:4195return NVPTXISD::Suld2DArrayV2I16Zero;4196case Intrinsic::nvvm_suld_2d_array_v2i32_zero:4197return NVPTXISD::Suld2DArrayV2I32Zero;4198case Intrinsic::nvvm_suld_2d_array_v2i64_zero:4199return NVPTXISD::Suld2DArrayV2I64Zero;4200case Intrinsic::nvvm_suld_2d_array_v4i8_zero:4201return NVPTXISD::Suld2DArrayV4I8Zero;4202case Intrinsic::nvvm_suld_2d_array_v4i16_zero:4203return NVPTXISD::Suld2DArrayV4I16Zero;4204case Intrinsic::nvvm_suld_2d_array_v4i32_zero:4205return NVPTXISD::Suld2DArrayV4I32Zero;4206case Intrinsic::nvvm_suld_3d_i8_zero:4207return NVPTXISD::Suld3DI8Zero;4208case Intrinsic::nvvm_suld_3d_i16_zero:4209return NVPTXISD::Suld3DI16Zero;4210case Intrinsic::nvvm_suld_3d_i32_zero:4211return NVPTXISD::Suld3DI32Zero;4212case Intrinsic::nvvm_suld_3d_i64_zero:4213return NVPTXISD::Suld3DI64Zero;4214case Intrinsic::nvvm_suld_3d_v2i8_zero:4215return NVPTXISD::Suld3DV2I8Zero;4216case Intrinsic::nvvm_suld_3d_v2i16_zero:4217return NVPTXISD::Suld3DV2I16Zero;4218case Intrinsic::nvvm_suld_3d_v2i32_zero:4219return NVPTXISD::Suld3DV2I32Zero;4220case Intrinsic::nvvm_suld_3d_v2i64_zero:4221return NVPTXISD::Suld3DV2I64Zero;4222case Intrinsic::nvvm_suld_3d_v4i8_zero:4223return NVPTXISD::Suld3DV4I8Zero;4224case Intrinsic::nvvm_suld_3d_v4i16_zero:4225return NVPTXISD::Suld3DV4I16Zero;4226case Intrinsic::nvvm_suld_3d_v4i32_zero:4227return NVPTXISD::Suld3DV4I32Zero;4228}4229}42304231// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as4232// TgtMemIntrinsic4233// because we need the information that is only available in the "Value" type4234// of destination4235// pointer. 
In particular, the address space information.4236bool NVPTXTargetLowering::getTgtMemIntrinsic(4237IntrinsicInfo &Info, const CallInst &I,4238MachineFunction &MF, unsigned Intrinsic) const {4239switch (Intrinsic) {4240default:4241return false;4242case Intrinsic::nvvm_match_all_sync_i32p:4243case Intrinsic::nvvm_match_all_sync_i64p:4244Info.opc = ISD::INTRINSIC_W_CHAIN;4245// memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute4246// in order to model data exchange with other threads, but perform no real4247// memory accesses.4248Info.memVT = MVT::i1;42494250// Our result depends on both our and other thread's arguments.4251Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;4252return true;4253case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:4254case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:4255case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:4256case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:4257case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:4258case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:4259case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:4260case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:4261case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:4262case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:4263case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:4264case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:4265case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:4266case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:4267case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:4268case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:4269case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:4270case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:4271case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:4272case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:4273case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:4274case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:4275case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:4276case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {4277Info.opc = ISD::INTRINSIC_W_CHAIN;4278Info.memVT = MVT::v8f16;4279Info.ptrVal = I.getArgOperand(0);4280Info.offset = 0;4281Info.flags = MachineMemOperand::MOLoad;4282Info.align = Align(16);4283return true;4284}4285case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:4286case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:4287case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:4288case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:4289case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:4290case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:4291case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:4292case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:4293case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:4294case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:4295case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:4296case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:4297case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:4298case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:4299case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:4300case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:4301case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:4302case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:4303case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:4304case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:4305case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:4306case 
Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:4307case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:4308case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {4309Info.opc = ISD::INTRINSIC_W_CHAIN;4310Info.memVT = MVT::v2i32;4311Info.ptrVal = I.getArgOperand(0);4312Info.offset = 0;4313Info.flags = MachineMemOperand::MOLoad;4314Info.align = Align(8);4315return true;4316}43174318case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:4319case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:4320case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:4321case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:4322case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:4323case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:4324case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:4325case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:4326case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:4327case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:4328case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:4329case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:4330case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:4331case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:4332case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:4333case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:43344335case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:4336case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:4337case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:4338case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:4339case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:4340case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:4341case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:4342case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:4343case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:4344case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:4345case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:4346case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:4347case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:4348case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:4349case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:4350case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:4351case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:4352case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {4353Info.opc = ISD::INTRINSIC_W_CHAIN;4354Info.memVT = MVT::v4i32;4355Info.ptrVal = I.getArgOperand(0);4356Info.offset = 0;4357Info.flags = MachineMemOperand::MOLoad;4358Info.align = Align(16);4359return true;4360}43614362case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:4363case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:4364case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:4365case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:4366case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:4367case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:4368case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:4369case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:43704371case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:4372case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:4373case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:4374case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:4375case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:4376case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:4377case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:4378case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:4379case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:4380case 
Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:4381case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:4382case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:4383case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:4384case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:4385case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:4386case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:4387case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:4388case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:4389case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:4390case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:4391case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:4392case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {4393Info.opc = ISD::INTRINSIC_W_CHAIN;4394Info.memVT = MVT::i32;4395Info.ptrVal = I.getArgOperand(0);4396Info.offset = 0;4397Info.flags = MachineMemOperand::MOLoad;4398Info.align = Align(4);4399return true;4400}44014402case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:4403case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:4404case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:4405case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:4406case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:4407case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:4408case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:4409case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:4410case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:4411case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:4412case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:4413case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {4414Info.opc = ISD::INTRINSIC_W_CHAIN;4415Info.memVT = MVT::v4f16;4416Info.ptrVal = I.getArgOperand(0);4417Info.offset = 0;4418Info.flags = MachineMemOperand::MOLoad;4419Info.align = Align(16);4420return true;4421}44224423case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:4424case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:4425case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:4426case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:4427case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:4428case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:4429case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:4430case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:4431case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:4432case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:4433case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:4434case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:4435case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:4436case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:4437case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:4438case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {4439Info.opc = ISD::INTRINSIC_W_CHAIN;4440Info.memVT = MVT::v8f32;4441Info.ptrVal = I.getArgOperand(0);4442Info.offset = 0;4443Info.flags = MachineMemOperand::MOLoad;4444Info.align = Align(16);4445return true;4446}44474448case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:4449case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:4450case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:4451case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:44524453case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:4454case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:4455case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:4456case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:44574458case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:4459case 
Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:4460case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:4461case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:4462case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:4463case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:4464case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:4465case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:4466case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:4467case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:4468case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:4469case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {4470Info.opc = ISD::INTRINSIC_W_CHAIN;4471Info.memVT = MVT::v8i32;4472Info.ptrVal = I.getArgOperand(0);4473Info.offset = 0;4474Info.flags = MachineMemOperand::MOLoad;4475Info.align = Align(16);4476return true;4477}44784479case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:4480case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:4481case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:4482case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:4483case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:4484case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:4485case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:4486case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:4487case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:4488case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {4489Info.opc = ISD::INTRINSIC_W_CHAIN;4490Info.memVT = MVT::v2i32;4491Info.ptrVal = I.getArgOperand(0);4492Info.offset = 0;4493Info.flags = MachineMemOperand::MOLoad;4494Info.align = Align(8);4495return true;4496}44974498case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:4499case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:4500case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:4501case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:45024503case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:4504case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:4505case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:4506case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {4507Info.opc = ISD::INTRINSIC_W_CHAIN;4508Info.memVT = MVT::f64;4509Info.ptrVal = I.getArgOperand(0);4510Info.offset = 0;4511Info.flags = MachineMemOperand::MOLoad;4512Info.align = Align(8);4513return true;4514}45154516case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:4517case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:4518case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:4519case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {4520Info.opc = ISD::INTRINSIC_W_CHAIN;4521Info.memVT = MVT::v2f64;4522Info.ptrVal = I.getArgOperand(0);4523Info.offset = 0;4524Info.flags = MachineMemOperand::MOLoad;4525Info.align = Align(16);4526return true;4527}45284529case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:4530case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:4531case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:4532case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:4533case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:4534case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:4535case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:4536case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:4537case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:4538case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:4539case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:4540case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {4541Info.opc = ISD::INTRINSIC_VOID;4542Info.memVT = MVT::v4f16;4543Info.ptrVal = 
I.getArgOperand(0);4544Info.offset = 0;4545Info.flags = MachineMemOperand::MOStore;4546Info.align = Align(16);4547return true;4548}45494550case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:4551case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:4552case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:4553case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:4554case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:4555case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:4556case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:4557case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:4558case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:4559case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:4560case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:4561case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:4562case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:4563case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:4564case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:4565case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {4566Info.opc = ISD::INTRINSIC_VOID;4567Info.memVT = MVT::v8f32;4568Info.ptrVal = I.getArgOperand(0);4569Info.offset = 0;4570Info.flags = MachineMemOperand::MOStore;4571Info.align = Align(16);4572return true;4573}45744575case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:4576case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:4577case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:4578case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:4579case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:4580case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:4581case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:4582case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:4583case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:4584case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:4585case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:4586case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {4587Info.opc = ISD::INTRINSIC_VOID;4588Info.memVT = MVT::v8i32;4589Info.ptrVal = I.getArgOperand(0);4590Info.offset = 0;4591Info.flags = MachineMemOperand::MOStore;4592Info.align = Align(16);4593return true;4594}45954596case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:4597case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:4598case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:4599case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:4600case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:4601case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:4602case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:4603case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {4604Info.opc = ISD::INTRINSIC_VOID;4605Info.memVT = MVT::v2i32;4606Info.ptrVal = I.getArgOperand(0);4607Info.offset = 0;4608Info.flags = MachineMemOperand::MOStore;4609Info.align = Align(8);4610return true;4611}46124613case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:4614case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:4615case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:4616case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {4617Info.opc = ISD::INTRINSIC_VOID;4618Info.memVT = MVT::v2f64;4619Info.ptrVal = I.getArgOperand(0);4620Info.offset = 0;4621Info.flags = MachineMemOperand::MOStore;4622Info.align = Align(16);4623return true;4624}46254626case Intrinsic::nvvm_atomic_load_inc_32:4627case Intrinsic::nvvm_atomic_load_dec_32:46284629case Intrinsic::nvvm_atomic_add_gen_f_cta:4630case Intrinsic::nvvm_atomic_add_gen_f_sys:4631case 
Intrinsic::nvvm_atomic_add_gen_i_cta:4632case Intrinsic::nvvm_atomic_add_gen_i_sys:4633case Intrinsic::nvvm_atomic_and_gen_i_cta:4634case Intrinsic::nvvm_atomic_and_gen_i_sys:4635case Intrinsic::nvvm_atomic_cas_gen_i_cta:4636case Intrinsic::nvvm_atomic_cas_gen_i_sys:4637case Intrinsic::nvvm_atomic_dec_gen_i_cta:4638case Intrinsic::nvvm_atomic_dec_gen_i_sys:4639case Intrinsic::nvvm_atomic_inc_gen_i_cta:4640case Intrinsic::nvvm_atomic_inc_gen_i_sys:4641case Intrinsic::nvvm_atomic_max_gen_i_cta:4642case Intrinsic::nvvm_atomic_max_gen_i_sys:4643case Intrinsic::nvvm_atomic_min_gen_i_cta:4644case Intrinsic::nvvm_atomic_min_gen_i_sys:4645case Intrinsic::nvvm_atomic_or_gen_i_cta:4646case Intrinsic::nvvm_atomic_or_gen_i_sys:4647case Intrinsic::nvvm_atomic_exch_gen_i_cta:4648case Intrinsic::nvvm_atomic_exch_gen_i_sys:4649case Intrinsic::nvvm_atomic_xor_gen_i_cta:4650case Intrinsic::nvvm_atomic_xor_gen_i_sys: {4651auto &DL = I.getDataLayout();4652Info.opc = ISD::INTRINSIC_W_CHAIN;4653Info.memVT = getValueType(DL, I.getType());4654Info.ptrVal = I.getArgOperand(0);4655Info.offset = 0;4656Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;4657Info.align.reset();4658return true;4659}46604661case Intrinsic::nvvm_ldu_global_i:4662case Intrinsic::nvvm_ldu_global_f:4663case Intrinsic::nvvm_ldu_global_p: {4664auto &DL = I.getDataLayout();4665Info.opc = ISD::INTRINSIC_W_CHAIN;4666if (Intrinsic == Intrinsic::nvvm_ldu_global_i)4667Info.memVT = getValueType(DL, I.getType());4668else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)4669Info.memVT = getPointerTy(DL);4670else4671Info.memVT = getValueType(DL, I.getType());4672Info.ptrVal = I.getArgOperand(0);4673Info.offset = 0;4674Info.flags = MachineMemOperand::MOLoad;4675Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();46764677return true;4678}4679case Intrinsic::nvvm_ldg_global_i:4680case Intrinsic::nvvm_ldg_global_f:4681case Intrinsic::nvvm_ldg_global_p: {4682auto &DL = I.getDataLayout();46834684Info.opc = ISD::INTRINSIC_W_CHAIN;4685if (Intrinsic == Intrinsic::nvvm_ldg_global_i)4686Info.memVT = getValueType(DL, I.getType());4687else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)4688Info.memVT = getPointerTy(DL);4689else4690Info.memVT = getValueType(DL, I.getType());4691Info.ptrVal = I.getArgOperand(0);4692Info.offset = 0;4693Info.flags = MachineMemOperand::MOLoad;4694Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();46954696return true;4697}46984699case Intrinsic::nvvm_tex_1d_v4f32_s32:4700case Intrinsic::nvvm_tex_1d_v4f32_f32:4701case Intrinsic::nvvm_tex_1d_level_v4f32_f32:4702case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:4703case Intrinsic::nvvm_tex_1d_array_v4f32_s32:4704case Intrinsic::nvvm_tex_1d_array_v4f32_f32:4705case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:4706case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:4707case Intrinsic::nvvm_tex_2d_v4f32_s32:4708case Intrinsic::nvvm_tex_2d_v4f32_f32:4709case Intrinsic::nvvm_tex_2d_level_v4f32_f32:4710case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:4711case Intrinsic::nvvm_tex_2d_array_v4f32_s32:4712case Intrinsic::nvvm_tex_2d_array_v4f32_f32:4713case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:4714case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:4715case Intrinsic::nvvm_tex_3d_v4f32_s32:4716case Intrinsic::nvvm_tex_3d_v4f32_f32:4717case Intrinsic::nvvm_tex_3d_level_v4f32_f32:4718case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:4719case Intrinsic::nvvm_tex_cube_v4f32_f32:4720case Intrinsic::nvvm_tex_cube_level_v4f32_f32:4721case 
Intrinsic::nvvm_tex_cube_array_v4f32_f32:4722case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:4723case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:4724case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:4725case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:4726case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:4727case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:4728case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:4729case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:4730case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:4731case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:4732case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:4733case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:4734case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:4735case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:4736case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:4737case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:4738case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:4739case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:4740case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:4741case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:4742case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:4743case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:4744case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:4745case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:4746case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:4747case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:4748case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:4749case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:4750case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:4751case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:4752case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:4753case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:4754case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:4755case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:4756case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:4757Info.opc = getOpcForTextureInstr(Intrinsic);4758Info.memVT = MVT::v4f32;4759Info.ptrVal = nullptr;4760Info.offset = 0;4761Info.flags = MachineMemOperand::MOLoad;4762Info.align = Align(16);4763return true;47644765case Intrinsic::nvvm_tex_1d_v4s32_s32:4766case Intrinsic::nvvm_tex_1d_v4s32_f32:4767case Intrinsic::nvvm_tex_1d_level_v4s32_f32:4768case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:4769case Intrinsic::nvvm_tex_1d_array_v4s32_s32:4770case Intrinsic::nvvm_tex_1d_array_v4s32_f32:4771case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:4772case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:4773case Intrinsic::nvvm_tex_2d_v4s32_s32:4774case Intrinsic::nvvm_tex_2d_v4s32_f32:4775case Intrinsic::nvvm_tex_2d_level_v4s32_f32:4776case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:4777case Intrinsic::nvvm_tex_2d_array_v4s32_s32:4778case Intrinsic::nvvm_tex_2d_array_v4s32_f32:4779case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:4780case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:4781case Intrinsic::nvvm_tex_3d_v4s32_s32:4782case Intrinsic::nvvm_tex_3d_v4s32_f32:4783case Intrinsic::nvvm_tex_3d_level_v4s32_f32:4784case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:4785case Intrinsic::nvvm_tex_cube_v4s32_f32:4786case Intrinsic::nvvm_tex_cube_level_v4s32_f32:4787case Intrinsic::nvvm_tex_cube_array_v4s32_f32:4788case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:4789case Intrinsic::nvvm_tex_cube_v4u32_f32:4790case Intrinsic::nvvm_tex_cube_level_v4u32_f32:4791case Intrinsic::nvvm_tex_cube_array_v4u32_f32:4792case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:4793case Intrinsic::nvvm_tex_1d_v4u32_s32:4794case 
Intrinsic::nvvm_tex_1d_v4u32_f32:4795case Intrinsic::nvvm_tex_1d_level_v4u32_f32:4796case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:4797case Intrinsic::nvvm_tex_1d_array_v4u32_s32:4798case Intrinsic::nvvm_tex_1d_array_v4u32_f32:4799case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:4800case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:4801case Intrinsic::nvvm_tex_2d_v4u32_s32:4802case Intrinsic::nvvm_tex_2d_v4u32_f32:4803case Intrinsic::nvvm_tex_2d_level_v4u32_f32:4804case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:4805case Intrinsic::nvvm_tex_2d_array_v4u32_s32:4806case Intrinsic::nvvm_tex_2d_array_v4u32_f32:4807case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:4808case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:4809case Intrinsic::nvvm_tex_3d_v4u32_s32:4810case Intrinsic::nvvm_tex_3d_v4u32_f32:4811case Intrinsic::nvvm_tex_3d_level_v4u32_f32:4812case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:4813case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:4814case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:4815case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:4816case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:4817case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:4818case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:4819case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:4820case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:4821case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:4822case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:4823case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:4824case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:4825case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:4826case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:4827case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:4828case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:4829case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:4830case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:4831case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:4832case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:4833case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:4834case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:4835case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:4836case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:4837case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:4838case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:4839case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:4840case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:4841case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:4842case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:4843case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:4844case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:4845case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:4846case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:4847case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:4848case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:4849case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:4850case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:4851case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:4852case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:4853case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:4854case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:4855case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:4856case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:4857case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:4858case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:4859case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:4860case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:4861case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:4862case 
Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:4863case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:4864case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:4865case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:4866case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:4867case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:4868case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:4869case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:4870case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:4871case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:4872case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:4873case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:4874case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:4875case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:4876case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:4877case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:4878case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:4879case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:4880case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:4881Info.opc = getOpcForTextureInstr(Intrinsic);4882Info.memVT = MVT::v4i32;4883Info.ptrVal = nullptr;4884Info.offset = 0;4885Info.flags = MachineMemOperand::MOLoad;4886Info.align = Align(16);4887return true;48884889case Intrinsic::nvvm_suld_1d_i8_clamp:4890case Intrinsic::nvvm_suld_1d_v2i8_clamp:4891case Intrinsic::nvvm_suld_1d_v4i8_clamp:4892case Intrinsic::nvvm_suld_1d_array_i8_clamp:4893case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:4894case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:4895case Intrinsic::nvvm_suld_2d_i8_clamp:4896case Intrinsic::nvvm_suld_2d_v2i8_clamp:4897case Intrinsic::nvvm_suld_2d_v4i8_clamp:4898case Intrinsic::nvvm_suld_2d_array_i8_clamp:4899case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:4900case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:4901case Intrinsic::nvvm_suld_3d_i8_clamp:4902case Intrinsic::nvvm_suld_3d_v2i8_clamp:4903case Intrinsic::nvvm_suld_3d_v4i8_clamp:4904case Intrinsic::nvvm_suld_1d_i8_trap:4905case Intrinsic::nvvm_suld_1d_v2i8_trap:4906case Intrinsic::nvvm_suld_1d_v4i8_trap:4907case Intrinsic::nvvm_suld_1d_array_i8_trap:4908case Intrinsic::nvvm_suld_1d_array_v2i8_trap:4909case Intrinsic::nvvm_suld_1d_array_v4i8_trap:4910case Intrinsic::nvvm_suld_2d_i8_trap:4911case Intrinsic::nvvm_suld_2d_v2i8_trap:4912case Intrinsic::nvvm_suld_2d_v4i8_trap:4913case Intrinsic::nvvm_suld_2d_array_i8_trap:4914case Intrinsic::nvvm_suld_2d_array_v2i8_trap:4915case Intrinsic::nvvm_suld_2d_array_v4i8_trap:4916case Intrinsic::nvvm_suld_3d_i8_trap:4917case Intrinsic::nvvm_suld_3d_v2i8_trap:4918case Intrinsic::nvvm_suld_3d_v4i8_trap:4919case Intrinsic::nvvm_suld_1d_i8_zero:4920case Intrinsic::nvvm_suld_1d_v2i8_zero:4921case Intrinsic::nvvm_suld_1d_v4i8_zero:4922case Intrinsic::nvvm_suld_1d_array_i8_zero:4923case Intrinsic::nvvm_suld_1d_array_v2i8_zero:4924case Intrinsic::nvvm_suld_1d_array_v4i8_zero:4925case Intrinsic::nvvm_suld_2d_i8_zero:4926case Intrinsic::nvvm_suld_2d_v2i8_zero:4927case Intrinsic::nvvm_suld_2d_v4i8_zero:4928case Intrinsic::nvvm_suld_2d_array_i8_zero:4929case Intrinsic::nvvm_suld_2d_array_v2i8_zero:4930case Intrinsic::nvvm_suld_2d_array_v4i8_zero:4931case Intrinsic::nvvm_suld_3d_i8_zero:4932case Intrinsic::nvvm_suld_3d_v2i8_zero:4933case Intrinsic::nvvm_suld_3d_v4i8_zero:4934Info.opc = getOpcForSurfaceInstr(Intrinsic);4935Info.memVT = MVT::i8;4936Info.ptrVal = nullptr;4937Info.offset = 0;4938Info.flags = MachineMemOperand::MOLoad;4939Info.align = Align(16);4940return true;49414942case 
Intrinsic::nvvm_suld_1d_i16_clamp:4943case Intrinsic::nvvm_suld_1d_v2i16_clamp:4944case Intrinsic::nvvm_suld_1d_v4i16_clamp:4945case Intrinsic::nvvm_suld_1d_array_i16_clamp:4946case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:4947case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:4948case Intrinsic::nvvm_suld_2d_i16_clamp:4949case Intrinsic::nvvm_suld_2d_v2i16_clamp:4950case Intrinsic::nvvm_suld_2d_v4i16_clamp:4951case Intrinsic::nvvm_suld_2d_array_i16_clamp:4952case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:4953case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:4954case Intrinsic::nvvm_suld_3d_i16_clamp:4955case Intrinsic::nvvm_suld_3d_v2i16_clamp:4956case Intrinsic::nvvm_suld_3d_v4i16_clamp:4957case Intrinsic::nvvm_suld_1d_i16_trap:4958case Intrinsic::nvvm_suld_1d_v2i16_trap:4959case Intrinsic::nvvm_suld_1d_v4i16_trap:4960case Intrinsic::nvvm_suld_1d_array_i16_trap:4961case Intrinsic::nvvm_suld_1d_array_v2i16_trap:4962case Intrinsic::nvvm_suld_1d_array_v4i16_trap:4963case Intrinsic::nvvm_suld_2d_i16_trap:4964case Intrinsic::nvvm_suld_2d_v2i16_trap:4965case Intrinsic::nvvm_suld_2d_v4i16_trap:4966case Intrinsic::nvvm_suld_2d_array_i16_trap:4967case Intrinsic::nvvm_suld_2d_array_v2i16_trap:4968case Intrinsic::nvvm_suld_2d_array_v4i16_trap:4969case Intrinsic::nvvm_suld_3d_i16_trap:4970case Intrinsic::nvvm_suld_3d_v2i16_trap:4971case Intrinsic::nvvm_suld_3d_v4i16_trap:4972case Intrinsic::nvvm_suld_1d_i16_zero:4973case Intrinsic::nvvm_suld_1d_v2i16_zero:4974case Intrinsic::nvvm_suld_1d_v4i16_zero:4975case Intrinsic::nvvm_suld_1d_array_i16_zero:4976case Intrinsic::nvvm_suld_1d_array_v2i16_zero:4977case Intrinsic::nvvm_suld_1d_array_v4i16_zero:4978case Intrinsic::nvvm_suld_2d_i16_zero:4979case Intrinsic::nvvm_suld_2d_v2i16_zero:4980case Intrinsic::nvvm_suld_2d_v4i16_zero:4981case Intrinsic::nvvm_suld_2d_array_i16_zero:4982case Intrinsic::nvvm_suld_2d_array_v2i16_zero:4983case Intrinsic::nvvm_suld_2d_array_v4i16_zero:4984case Intrinsic::nvvm_suld_3d_i16_zero:4985case Intrinsic::nvvm_suld_3d_v2i16_zero:4986case Intrinsic::nvvm_suld_3d_v4i16_zero:4987Info.opc = getOpcForSurfaceInstr(Intrinsic);4988Info.memVT = MVT::i16;4989Info.ptrVal = nullptr;4990Info.offset = 0;4991Info.flags = MachineMemOperand::MOLoad;4992Info.align = Align(16);4993return true;49944995case Intrinsic::nvvm_suld_1d_i32_clamp:4996case Intrinsic::nvvm_suld_1d_v2i32_clamp:4997case Intrinsic::nvvm_suld_1d_v4i32_clamp:4998case Intrinsic::nvvm_suld_1d_array_i32_clamp:4999case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:5000case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:5001case Intrinsic::nvvm_suld_2d_i32_clamp:5002case Intrinsic::nvvm_suld_2d_v2i32_clamp:5003case Intrinsic::nvvm_suld_2d_v4i32_clamp:5004case Intrinsic::nvvm_suld_2d_array_i32_clamp:5005case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:5006case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:5007case Intrinsic::nvvm_suld_3d_i32_clamp:5008case Intrinsic::nvvm_suld_3d_v2i32_clamp:5009case Intrinsic::nvvm_suld_3d_v4i32_clamp:5010case Intrinsic::nvvm_suld_1d_i32_trap:5011case Intrinsic::nvvm_suld_1d_v2i32_trap:5012case Intrinsic::nvvm_suld_1d_v4i32_trap:5013case Intrinsic::nvvm_suld_1d_array_i32_trap:5014case Intrinsic::nvvm_suld_1d_array_v2i32_trap:5015case Intrinsic::nvvm_suld_1d_array_v4i32_trap:5016case Intrinsic::nvvm_suld_2d_i32_trap:5017case Intrinsic::nvvm_suld_2d_v2i32_trap:5018case Intrinsic::nvvm_suld_2d_v4i32_trap:5019case Intrinsic::nvvm_suld_2d_array_i32_trap:5020case Intrinsic::nvvm_suld_2d_array_v2i32_trap:5021case Intrinsic::nvvm_suld_2d_array_v4i32_trap:5022case 
Intrinsic::nvvm_suld_3d_i32_trap:5023case Intrinsic::nvvm_suld_3d_v2i32_trap:5024case Intrinsic::nvvm_suld_3d_v4i32_trap:5025case Intrinsic::nvvm_suld_1d_i32_zero:5026case Intrinsic::nvvm_suld_1d_v2i32_zero:5027case Intrinsic::nvvm_suld_1d_v4i32_zero:5028case Intrinsic::nvvm_suld_1d_array_i32_zero:5029case Intrinsic::nvvm_suld_1d_array_v2i32_zero:5030case Intrinsic::nvvm_suld_1d_array_v4i32_zero:5031case Intrinsic::nvvm_suld_2d_i32_zero:5032case Intrinsic::nvvm_suld_2d_v2i32_zero:5033case Intrinsic::nvvm_suld_2d_v4i32_zero:5034case Intrinsic::nvvm_suld_2d_array_i32_zero:5035case Intrinsic::nvvm_suld_2d_array_v2i32_zero:5036case Intrinsic::nvvm_suld_2d_array_v4i32_zero:5037case Intrinsic::nvvm_suld_3d_i32_zero:5038case Intrinsic::nvvm_suld_3d_v2i32_zero:5039case Intrinsic::nvvm_suld_3d_v4i32_zero:5040Info.opc = getOpcForSurfaceInstr(Intrinsic);5041Info.memVT = MVT::i32;5042Info.ptrVal = nullptr;5043Info.offset = 0;5044Info.flags = MachineMemOperand::MOLoad;5045Info.align = Align(16);5046return true;50475048case Intrinsic::nvvm_suld_1d_i64_clamp:5049case Intrinsic::nvvm_suld_1d_v2i64_clamp:5050case Intrinsic::nvvm_suld_1d_array_i64_clamp:5051case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:5052case Intrinsic::nvvm_suld_2d_i64_clamp:5053case Intrinsic::nvvm_suld_2d_v2i64_clamp:5054case Intrinsic::nvvm_suld_2d_array_i64_clamp:5055case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:5056case Intrinsic::nvvm_suld_3d_i64_clamp:5057case Intrinsic::nvvm_suld_3d_v2i64_clamp:5058case Intrinsic::nvvm_suld_1d_i64_trap:5059case Intrinsic::nvvm_suld_1d_v2i64_trap:5060case Intrinsic::nvvm_suld_1d_array_i64_trap:5061case Intrinsic::nvvm_suld_1d_array_v2i64_trap:5062case Intrinsic::nvvm_suld_2d_i64_trap:5063case Intrinsic::nvvm_suld_2d_v2i64_trap:5064case Intrinsic::nvvm_suld_2d_array_i64_trap:5065case Intrinsic::nvvm_suld_2d_array_v2i64_trap:5066case Intrinsic::nvvm_suld_3d_i64_trap:5067case Intrinsic::nvvm_suld_3d_v2i64_trap:5068case Intrinsic::nvvm_suld_1d_i64_zero:5069case Intrinsic::nvvm_suld_1d_v2i64_zero:5070case Intrinsic::nvvm_suld_1d_array_i64_zero:5071case Intrinsic::nvvm_suld_1d_array_v2i64_zero:5072case Intrinsic::nvvm_suld_2d_i64_zero:5073case Intrinsic::nvvm_suld_2d_v2i64_zero:5074case Intrinsic::nvvm_suld_2d_array_i64_zero:5075case Intrinsic::nvvm_suld_2d_array_v2i64_zero:5076case Intrinsic::nvvm_suld_3d_i64_zero:5077case Intrinsic::nvvm_suld_3d_v2i64_zero:5078Info.opc = getOpcForSurfaceInstr(Intrinsic);5079Info.memVT = MVT::i64;5080Info.ptrVal = nullptr;5081Info.offset = 0;5082Info.flags = MachineMemOperand::MOLoad;5083Info.align = Align(16);5084return true;5085}5086return false;5087}50885089/// getFunctionParamOptimizedAlign - since function arguments are passed via5090/// .param space, we may want to increase their alignment in a way that5091/// ensures that we can effectively vectorize their loads & stores. We can5092/// increase alignment only if the function has internal or has private5093/// linkage as for other linkage types callers may already rely on default5094/// alignment. 
/// To allow using 128-bit vectorized loads/stores, this function ensures that
/// alignment is 16 or greater.
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
    const Function *F, Type *ArgTy, const DataLayout &DL) const {
  // Capping the alignment to 128 bytes as that is the maximum alignment
  // supported by PTX.
  const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));

  // If a function has linkage different from internal or private, we
  // must use default ABI alignment as external users rely on it. Same
  // for a function that may be called from a function pointer.
  if (!F || !F->hasLocalLinkage() ||
      F->hasAddressTaken(/*Users=*/nullptr,
                         /*IgnoreCallbackUses=*/false,
                         /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
    return ABITypeAlign;

  assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
  return std::max(Align(16), ABITypeAlign);
}

/// Helper for computing alignment of a device function byval parameter.
Align NVPTXTargetLowering::getFunctionByValParamAlign(
    const Function *F, Type *ArgTy, Align InitialAlign,
    const DataLayout &DL) const {
  Align ArgAlign = InitialAlign;
  // Try to increase alignment to enhance vectorization options.
  if (F)
    ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));

  // Old ptx versions have a bug. When PTX code takes address of
  // byval parameter with alignment < 4, ptxas generates code to
  // spill argument into memory. Alas on sm_50+ ptxas generates
  // SASS code that fails with misaligned access. To work around
  // the problem, make sure that we align byval parameters by at
  // least 4. This bug seems to be fixed at least starting from
  // ptxas > 9.0.
  // TODO: remove this after verifying the bug is not reproduced
  // on non-deprecated ptxas versions.
  if (ForceMinByValParamAlign)
    ArgAlign = std::max(ArgAlign, Align(4));

  return ArgAlign;
}
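// Illustrative sketch (hypothetical example, not taken from this file): for a
// local-linkage device function such as
//   struct Quad { float x, y, z, w; };   // natural ABI alignment of 4
//   static void consume(Quad q);         // internal linkage, address not taken
// getFunctionParamOptimizedAlign raises the .param alignment to 16, allowing
// the four f32 loads of the argument to be emitted as a single 128-bit
// vectorized ld.param access. An externally visible function keeps the plain
// ABI alignment, since callers outside the module may already rely on it.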
// Helper for getting a function parameter name. Name is composed from
// its index and the function name. Negative index corresponds to special
// parameter (unsized array) used for passing variable arguments.
std::string NVPTXTargetLowering::getParamName(const Function *F,
                                              int Idx) const {
  std::string ParamName;
  raw_string_ostream ParamStr(ParamName);

  ParamStr << getTargetMachine().getSymbol(F)->getName();
  if (Idx < 0)
    ParamStr << "_vararg";
  else
    ParamStr << "_param_" << Idx;

  return ParamName;
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]

  // immoff must fit in a signed 32-bit int
  if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
    return false;

  if (AM.BaseGV)
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed
    return false;
  }
  return true;
}

//===----------------------------------------------------------------------===//
// NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case 'q':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'q': {
      if (STI.getSmVersion() < 70)
        report_fatal_error("Inline asm with 128 bit operands is only "
                           "supported for sm_70 and higher!");
      return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
    }
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
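// Illustrative sketch (hypothetical CUDA snippet, for exposition only): the
// constraint letters above bind inline PTX operands to the register classes
// returned here, e.g.
//   unsigned lane;
//   asm("mov.u32 %0, %%laneid;" : "=r"(lane));       // 'r' -> Int32Regs
//   double d = 2.0;
//   asm("sqrt.rn.f64 %0, %1;" : "=d"(d) : "d"(d));   // 'd' -> Float64Regs
// The 'q' constraint (128-bit operands) is additionally gated on sm_70 or
// newer, as enforced by the report_fatal_error above.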
//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOptLevel OptLevel) const {
  // Always honor command-line argument
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

static bool isConstZero(const SDValue &Operand) {
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 0;
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N0.getValueType();

  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  if (!N0.getNode()->hasOneUse())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL)
    return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);

  // fold (add (select cond, 0, (mul a, b)), c)
  //   -> (select cond, c, (mad a, b, c))
  //
  if (N0.getOpcode() == ISD::SELECT) {
    unsigned ZeroOpNum;
    if (isConstZero(N0->getOperand(1)))
      ZeroOpNum = 1;
    else if (isConstZero(N0->getOperand(2)))
      ZeroOpNum = 2;
    else
      return SDValue();

    SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
    if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
      return SDValue();

    SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                                  M->getOperand(0), M->getOperand(1), N1);
    return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
                             ((ZeroOpNum == 1) ? N1 : MAD),
                             ((ZeroOpNum == 1) ? MAD : N1));
  }

  return SDValue();
}
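// Illustrative sketch (hypothetical i32 DAG, for exposition only): with the
// folds above,
//   (add (mul %a, %b), %c)                  --> (NVPTXISD::IMAD %a, %b, %c)
//   (add (select %p, 0, (mul %a, %b)), %c)  --> (select %p, %c,
//                                                (NVPTXISD::IMAD %a, %b, %c))
// assuming the mul (and the select) have no other uses. The IMAD node is
// expected to select to a single mad.lo instruction instead of a mul
// followed by an add.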
static SDValue
PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               CodeGenOptLevel OptLevel) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::FMUL) {
    const auto *TLI = static_cast<const NVPTXTargetLowering *>(
        &DCI.DAG.getTargetLoweringInfo());
    if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
      return SDValue();

    // For floating point:
    // Do the fusion only when the mul has less than 5 uses and all
    // are add.
    // The heuristic is that if a use is not an add, then that use
    // cannot be fused into fma, therefore mul is still needed anyway.
    // If there are more than 4 uses, even if they are all add, fusing
    // them will increase register pressure.
    //
    int numUses = 0;
    int nonAddCount = 0;
    for (const SDNode *User : N0.getNode()->uses()) {
      numUses++;
      if (User->getOpcode() != ISD::FADD)
        ++nonAddCount;
      if (numUses >= 5)
        return SDValue();
    }
    if (nonAddCount) {
      int orderNo = N->getIROrder();
      int orderNo2 = N0.getNode()->getIROrder();
      // Simple heuristic for estimating potential register pressure: the
      // difference between the IR orders measures the distance between def
      // and use, and the longer that distance, the more likely the fusion is
      // to cause register pressure.
      if (orderNo - orderNo2 < 500)
        return SDValue();

      // Now, check if at least one of the FMUL's operands is live beyond the
      // node N, which guarantees that the FMA will not increase register
      // pressure at node N.
      bool opIsLive = false;
      const SDNode *left = N0.getOperand(0).getNode();
      const SDNode *right = N0.getOperand(1).getNode();

      if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
        opIsLive = true;

      if (!opIsLive)
        for (const SDNode *User : left->uses()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        for (const SDNode *User : right->uses()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        return SDValue();
    }

    return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);
  }

  return SDValue();
}
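// Illustrative sketch (hypothetical f32 DAG, for exposition only): given
//   %m = fmul %a, %b   ; single FADD use
//   %s = fadd %m, %c
// the combine above forms (ISD::FMA %a, %b, %c), which is expected to select
// to a fused multiply-add such as fma.rn.f32, provided allowFMA() permits
// contraction. If %m also had non-FADD uses, the mul would stay live anyway,
// so the fold is only applied when the def-use distance heuristic suggests
// the extra FMA will not increase register pressure.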
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
                                         std::size_t Back) {
  if (all_of(N->ops().drop_front(Front).drop_back(Back),
             [](const SDUse &U) { return U.get()->isUndef(); }))
    // Operand 0 is the previous value in the chain. Cannot return EntryToken
    // as the previous value will become unused and eliminated later.
    return N->getOperand(0);

  return SDValue();
}

static SDValue PerformStoreParamCombine(SDNode *N) {
  // Operands from the 3rd to the 2nd last one are the values to be stored.
  //   {Chain, ArgID, Offset, Val, Glue}
  return PerformStoreCombineHelper(N, 3, 1);
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored
  return PerformStoreCombineHelper(N, 2, 0);
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector() || VT != MVT::i32)
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI);
}

/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
///
static SDValue PerformFADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  EVT VT = N0.getValueType();
  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;

  // Convert BFE-> truncate i16 -> and 255
  // To just BFE-> truncate i16, as the value already has all the bits in the
  // right places.
  if (Val.getOpcode() == ISD::TRUNCATE) {
    SDValue BFE = Val.getOperand(0);
    if (BFE.getOpcode() != NVPTXISD::BFE)
      return SDValue();

    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
    if (!BFEBits)
      return SDValue();
    uint64_t BFEBitsVal = BFEBits->getZExtValue();

    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }
    uint64_t MaskVal = MaskCnst->getZExtValue();

    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
      return SDValue();
    // If we get here, the AND is unnecessary.
Just replace it with the trunc5522DCI.CombineTo(N, Val, false);5523}5524// Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and5525if (Val.getOpcode() == ISD::ANY_EXTEND) {5526AExt = Val;5527Val = Val->getOperand(0);5528}55295530if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {5531Val = Val->getOperand(0);5532}55335534if (Val->getOpcode() == NVPTXISD::LoadV2 ||5535Val->getOpcode() == NVPTXISD::LoadV4) {5536ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);5537if (!MaskCnst) {5538// Not an AND with a constant5539return SDValue();5540}55415542uint64_t MaskVal = MaskCnst->getZExtValue();5543if (MaskVal != 0xff) {5544// Not an AND that chops off top 8 bits5545return SDValue();5546}55475548MemSDNode *Mem = dyn_cast<MemSDNode>(Val);5549if (!Mem) {5550// Not a MemSDNode?!?5551return SDValue();5552}55535554EVT MemVT = Mem->getMemoryVT();5555if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {5556// We only handle the i8 case5557return SDValue();5558}55595560unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);5561if (ExtType == ISD::SEXTLOAD) {5562// If for some reason the load is a sextload, the and is needed to zero5563// out the high 8 bits5564return SDValue();5565}55665567bool AddTo = false;5568if (AExt.getNode() != nullptr) {5569// Re-insert the ext as a zext.5570Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),5571AExt.getValueType(), Val);5572AddTo = true;5573}55745575// If we get here, the AND is unnecessary. Just replace it with the load5576DCI.CombineTo(N, Val, AddTo);5577}55785579return SDValue();5580}55815582static SDValue PerformREMCombine(SDNode *N,5583TargetLowering::DAGCombinerInfo &DCI,5584CodeGenOptLevel OptLevel) {5585assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);55865587// Don't do anything at less than -O2.5588if (OptLevel < CodeGenOptLevel::Default)5589return SDValue();55905591SelectionDAG &DAG = DCI.DAG;5592SDLoc DL(N);5593EVT VT = N->getValueType(0);5594bool IsSigned = N->getOpcode() == ISD::SREM;5595unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;55965597const SDValue &Num = N->getOperand(0);5598const SDValue &Den = N->getOperand(1);55995600for (const SDNode *U : Num->uses()) {5601if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&5602U->getOperand(1) == Den) {5603// Num % Den -> Num - (Num / Den) * Den5604return DAG.getNode(ISD::SUB, DL, VT, Num,5605DAG.getNode(ISD::MUL, DL, VT,5606DAG.getNode(DivOpc, DL, VT, Num, Den),5607Den));5608}5609}5610return SDValue();5611}56125613enum OperandSignedness {5614Signed = 0,5615Unsigned,5616Unknown5617};56185619/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand5620/// that can be demoted to \p OptSize bits without loss of information. The5621/// signedness of the operand, if determinable, is placed in \p S.5622static bool IsMulWideOperandDemotable(SDValue Op,5623unsigned OptSize,5624OperandSignedness &S) {5625S = Unknown;56265627if (Op.getOpcode() == ISD::SIGN_EXTEND ||5628Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {5629EVT OrigVT = Op.getOperand(0).getValueType();5630if (OrigVT.getFixedSizeInBits() <= OptSize) {5631S = Signed;5632return true;5633}5634} else if (Op.getOpcode() == ISD::ZERO_EXTEND) {5635EVT OrigVT = Op.getOperand(0).getValueType();5636if (OrigVT.getFixedSizeInBits() <= OptSize) {5637S = Unsigned;5638return true;5639}5640}56415642return false;5643}56445645/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can5646/// be demoted to \p OptSize bits without loss of information. 
If the operands5647/// contain a constant, it should appear as the RHS operand. The signedness of5648/// the operands is placed in \p IsSigned.5649static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,5650unsigned OptSize,5651bool &IsSigned) {5652OperandSignedness LHSSign;56535654// The LHS operand must be a demotable op5655if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))5656return false;56575658// We should have been able to determine the signedness from the LHS5659if (LHSSign == Unknown)5660return false;56615662IsSigned = (LHSSign == Signed);56635664// The RHS can be a demotable op or a constant5665if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {5666const APInt &Val = CI->getAPIntValue();5667if (LHSSign == Unsigned) {5668return Val.isIntN(OptSize);5669} else {5670return Val.isSignedIntN(OptSize);5671}5672} else {5673OperandSignedness RHSSign;5674if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))5675return false;56765677return LHSSign == RHSSign;5678}5679}56805681/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply5682/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform5683/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift5684/// amount.5685static SDValue TryMULWIDECombine(SDNode *N,5686TargetLowering::DAGCombinerInfo &DCI) {5687EVT MulType = N->getValueType(0);5688if (MulType != MVT::i32 && MulType != MVT::i64) {5689return SDValue();5690}56915692SDLoc DL(N);5693unsigned OptSize = MulType.getSizeInBits() >> 1;5694SDValue LHS = N->getOperand(0);5695SDValue RHS = N->getOperand(1);56965697// Canonicalize the multiply so the constant (if any) is on the right5698if (N->getOpcode() == ISD::MUL) {5699if (isa<ConstantSDNode>(LHS)) {5700std::swap(LHS, RHS);5701}5702}57035704// If we have a SHL, determine the actual multiply amount5705if (N->getOpcode() == ISD::SHL) {5706ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);5707if (!ShlRHS) {5708return SDValue();5709}57105711APInt ShiftAmt = ShlRHS->getAPIntValue();5712unsigned BitWidth = MulType.getSizeInBits();5713if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {5714APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;5715RHS = DCI.DAG.getConstant(MulVal, DL, MulType);5716} else {5717return SDValue();5718}5719}57205721bool Signed;5722// Verify that our operands are demotable5723if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {5724return SDValue();5725}57265727EVT DemotedVT;5728if (MulType == MVT::i32) {5729DemotedVT = MVT::i16;5730} else {5731DemotedVT = MVT::i32;5732}57335734// Truncate the operands to the correct size. 
Note that these are just for5735// type consistency and will (likely) be eliminated in later phases.5736SDValue TruncLHS =5737DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);5738SDValue TruncRHS =5739DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);57405741unsigned Opc;5742if (Signed) {5743Opc = NVPTXISD::MUL_WIDE_SIGNED;5744} else {5745Opc = NVPTXISD::MUL_WIDE_UNSIGNED;5746}57475748return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);5749}57505751static bool isConstOne(const SDValue &Operand) {5752const auto *Const = dyn_cast<ConstantSDNode>(Operand);5753return Const && Const->getZExtValue() == 1;5754}57555756static SDValue matchMADConstOnePattern(SDValue Add) {5757if (Add->getOpcode() != ISD::ADD)5758return SDValue();57595760if (isConstOne(Add->getOperand(0)))5761return Add->getOperand(1);57625763if (isConstOne(Add->getOperand(1)))5764return Add->getOperand(0);57655766return SDValue();5767}57685769static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,5770TargetLowering::DAGCombinerInfo &DCI) {57715772if (SDValue Y = matchMADConstOnePattern(Add))5773return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);57745775return SDValue();5776}57775778static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,5779SDLoc DL,5780TargetLowering::DAGCombinerInfo &DCI) {5781if (Select->getOpcode() != ISD::SELECT)5782return SDValue();57835784SDValue Cond = Select->getOperand(0);57855786unsigned ConstOpNo;5787if (isConstOne(Select->getOperand(1)))5788ConstOpNo = 1;5789else if (isConstOne(Select->getOperand(2)))5790ConstOpNo = 2;5791else5792return SDValue();57935794SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);57955796// Do not combine if the resulting sequence is not obviously profitable.5797if (!matchMADConstOnePattern(Y))5798return SDValue();57995800SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);58015802return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,5803(ConstOpNo == 1) ? X : NewMul,5804(ConstOpNo == 1) ? 
NewMul : X);5805}58065807static SDValue5808PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,5809TargetLowering::DAGCombinerInfo &DCI) {58105811EVT VT = N0.getValueType();5812if (VT.isVector())5813return SDValue();58145815if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)5816return SDValue();58175818SDLoc DL(N);58195820// (mul x, (add y, 1)) -> (mad x, y, x)5821if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))5822return Res;5823if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))5824return Res;58255826// (mul x, (select y, 1)) -> (select (mul x, y), x)5827if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))5828return Res;5829if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))5830return Res;58315832return SDValue();5833}58345835/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.5836static SDValue PerformMULCombine(SDNode *N,5837TargetLowering::DAGCombinerInfo &DCI,5838CodeGenOptLevel OptLevel) {5839if (OptLevel == CodeGenOptLevel::None)5840return SDValue();58415842if (SDValue Ret = TryMULWIDECombine(N, DCI))5843return Ret;58445845SDValue N0 = N->getOperand(0);5846SDValue N1 = N->getOperand(1);5847return PerformMULCombineWithOperands(N, N0, N1, DCI);5848}58495850/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.5851static SDValue PerformSHLCombine(SDNode *N,5852TargetLowering::DAGCombinerInfo &DCI,5853CodeGenOptLevel OptLevel) {5854if (OptLevel > CodeGenOptLevel::None) {5855// Try mul.wide combining at OptLevel > 05856if (SDValue Ret = TryMULWIDECombine(N, DCI))5857return Ret;5858}58595860return SDValue();5861}58625863static SDValue PerformSETCCCombine(SDNode *N,5864TargetLowering::DAGCombinerInfo &DCI,5865unsigned int SmVersion) {5866EVT CCType = N->getValueType(0);5867SDValue A = N->getOperand(0);5868SDValue B = N->getOperand(1);58695870EVT AType = A.getValueType();5871if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))5872return SDValue();58735874if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)5875return SDValue();58765877SDLoc DL(N);5878// setp.f16x2 returns two scalar predicates, which we need to5879// convert back to v2i1. The returned result will be scalarized by5880// the legalizer, but the comparison will remain a single vector5881// instruction.5882SDValue CCNode = DCI.DAG.getNode(5883A.getValueType() == MVT::v2f16 ? 
          NVPTXISD::SETP_F16X2 : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
  // handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
    return SDValue();

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If element has non-integer type, bitcast it back to the expected type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past legalizer, we may need to extend i8 -> i16 to match the register type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split vselect into individual per-element operations. Because
  // we use BFE/BFI instructions for byte extraction/insertion, we end up with
  // 32-bit values, so we may as well do the comparison as i32 to avoid the
  // conversions to/from i16 normally used for i8 values.
  SmallVector<SDValue, 4> E;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);
  for (int I = 0; I < 4; ++I) {
    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
                                DCI.DAG.getConstant(I, DL, MVT::i32));
    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    E.push_back(DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
  }
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}

static
SDValue PerformLOADCombine(SDNode *N,5973TargetLowering::DAGCombinerInfo &DCI) {5974SelectionDAG &DAG = DCI.DAG;5975LoadSDNode *LD = cast<LoadSDNode>(N);59765977// Lower a v16i8 load into a LoadV4 operation with i32 results instead of5978// letting ReplaceLoadVector split it into smaller loads during legalization.5979// This is done at dag-combine1 time, so that vector operations with i85980// elements can be optimised away instead of being needlessly split during5981// legalization, which involves storing to the stack and loading it back.5982EVT VT = N->getValueType(0);5983if (VT != MVT::v16i8)5984return SDValue();59855986SDLoc DL(N);59875988// Create a v4i32 vector load operation, effectively <4 x v4i8>.5989unsigned Opc = NVPTXISD::LoadV4;5990EVT NewVT = MVT::v4i32;5991EVT EltVT = NewVT.getVectorElementType();5992unsigned NumElts = NewVT.getVectorNumElements();5993EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};5994SDVTList RetVTList = DAG.getVTList(RetVTs);5995SmallVector<SDValue, 8> Ops(N->ops());5996Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));5997SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,5998LD->getMemOperand());5999SDValue NewChain = NewLoad.getValue(NumElts);60006001// Create a vector of the same type returned by the original load.6002SmallVector<SDValue, 4> Elts;6003for (unsigned i = 0; i < NumElts; i++)6004Elts.push_back(NewLoad.getValue(i));6005return DCI.DAG.getMergeValues(6006{DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),6007NewChain},6008DL);6009}60106011SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,6012DAGCombinerInfo &DCI) const {6013CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();6014switch (N->getOpcode()) {6015default: break;6016case ISD::ADD:6017return PerformADDCombine(N, DCI, OptLevel);6018case ISD::FADD:6019return PerformFADDCombine(N, DCI, OptLevel);6020case ISD::MUL:6021return PerformMULCombine(N, DCI, OptLevel);6022case ISD::SHL:6023return PerformSHLCombine(N, DCI, OptLevel);6024case ISD::AND:6025return PerformANDCombine(N, DCI);6026case ISD::UREM:6027case ISD::SREM:6028return PerformREMCombine(N, DCI, OptLevel);6029case ISD::SETCC:6030return PerformSETCCCombine(N, DCI, STI.getSmVersion());6031case ISD::LOAD:6032return PerformLOADCombine(N, DCI);6033case NVPTXISD::StoreRetval:6034case NVPTXISD::StoreRetvalV2:6035case NVPTXISD::StoreRetvalV4:6036return PerformStoreRetvalCombine(N);6037case NVPTXISD::StoreParam:6038case NVPTXISD::StoreParamV2:6039case NVPTXISD::StoreParamV4:6040return PerformStoreParamCombine(N);6041case ISD::EXTRACT_VECTOR_ELT:6042return PerformEXTRACTCombine(N, DCI);6043case ISD::VSELECT:6044return PerformVSELECTCombine(N, DCI);6045}6046return SDValue();6047}60486049/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.6050static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,6051SmallVectorImpl<SDValue> &Results) {6052EVT ResVT = N->getValueType(0);6053SDLoc DL(N);60546055assert(ResVT.isVector() && "Vector load must have vector type");60566057// We only handle "native" vector sizes for now, e.g. <4 x double> is not6058// legal. 
We can (and should) split that into 2 loads of <2 x double> here6059// but I'm leaving that as a TODO for now.6060assert(ResVT.isSimple() && "Can only handle simple types");6061switch (ResVT.getSimpleVT().SimpleTy) {6062default:6063return;6064case MVT::v2i8:6065case MVT::v2i16:6066case MVT::v2i32:6067case MVT::v2i64:6068case MVT::v2f16:6069case MVT::v2f32:6070case MVT::v2f64:6071case MVT::v4i8:6072case MVT::v4i16:6073case MVT::v4i32:6074case MVT::v4f16:6075case MVT::v4f32:6076case MVT::v8f16: // <4 x f16x2>6077case MVT::v8bf16: // <4 x bf16x2>6078case MVT::v8i16: // <4 x i16x2>6079// This is a "native" vector type6080break;6081}60826083LoadSDNode *LD = cast<LoadSDNode>(N);60846085Align Alignment = LD->getAlign();6086auto &TD = DAG.getDataLayout();6087Align PrefAlign =6088TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));6089if (Alignment < PrefAlign) {6090// This load is not sufficiently aligned, so bail out and let this vector6091// load be scalarized. Note that we may still be able to emit smaller6092// vector loads. For example, if we are loading a <4 x float> with an6093// alignment of 8, this check will fail but the legalizer will try again6094// with 2 x <2 x float>, which will succeed with an alignment of 8.6095return;6096}60976098EVT EltVT = ResVT.getVectorElementType();6099unsigned NumElts = ResVT.getVectorNumElements();61006101// Since LoadV2 is a target node, we cannot rely on DAG type legalization.6102// Therefore, we must ensure the type is legal. For i1 and i8, we set the6103// loaded type to i16 and propagate the "real" type as the memory type.6104bool NeedTrunc = false;6105if (EltVT.getSizeInBits() < 16) {6106EltVT = MVT::i16;6107NeedTrunc = true;6108}61096110unsigned Opcode = 0;6111SDVTList LdResVTs;6112bool Load16x2 = false;61136114switch (NumElts) {6115default:6116return;6117case 2:6118Opcode = NVPTXISD::LoadV2;6119LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);6120break;6121case 4: {6122Opcode = NVPTXISD::LoadV4;6123EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };6124LdResVTs = DAG.getVTList(ListVTs);6125break;6126}6127case 8: {6128// v8f16 is a special case. PTX doesn't have ld.v8.f166129// instruction. 
Instead, we split the vector into v2f16 chunks and6130// load them with ld.v4.b32.6131assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");6132Load16x2 = true;6133Opcode = NVPTXISD::LoadV4;6134EVT VVT;6135switch (EltVT.getSimpleVT().SimpleTy) {6136case MVT::f16:6137VVT = MVT::v2f16;6138break;6139case MVT::bf16:6140VVT = MVT::v2bf16;6141break;6142case MVT::i16:6143VVT = MVT::v2i16;6144break;6145default:6146llvm_unreachable("Unsupported v8 vector type.");6147}6148EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};6149LdResVTs = DAG.getVTList(ListVTs);6150break;6151}6152}61536154// Copy regular operands6155SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());61566157// The select routine does not have access to the LoadSDNode instance, so6158// pass along the extension information6159OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));61606161SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,6162LD->getMemoryVT(),6163LD->getMemOperand());61646165SmallVector<SDValue, 8> ScalarRes;6166if (Load16x2) {6167// Split v2f16 subvectors back into individual elements.6168NumElts /= 2;6169for (unsigned i = 0; i < NumElts; ++i) {6170SDValue SubVector = NewLD.getValue(i);6171SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,6172DAG.getIntPtrConstant(0, DL));6173SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,6174DAG.getIntPtrConstant(1, DL));6175ScalarRes.push_back(E0);6176ScalarRes.push_back(E1);6177}6178} else {6179for (unsigned i = 0; i < NumElts; ++i) {6180SDValue Res = NewLD.getValue(i);6181if (NeedTrunc)6182Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);6183ScalarRes.push_back(Res);6184}6185}61866187SDValue LoadChain = NewLD.getValue(NumElts);61886189SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);61906191Results.push_back(BuildVec);6192Results.push_back(LoadChain);6193}61946195static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,6196SmallVectorImpl<SDValue> &Results) {6197SDValue Chain = N->getOperand(0);6198SDValue Intrin = N->getOperand(1);6199SDLoc DL(N);62006201// Get the intrinsic ID6202unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();6203switch (IntrinNo) {6204default:6205return;6206case Intrinsic::nvvm_ldg_global_i:6207case Intrinsic::nvvm_ldg_global_f:6208case Intrinsic::nvvm_ldg_global_p:6209case Intrinsic::nvvm_ldu_global_i:6210case Intrinsic::nvvm_ldu_global_f:6211case Intrinsic::nvvm_ldu_global_p: {6212EVT ResVT = N->getValueType(0);62136214if (ResVT.isVector()) {6215// Vector LDG/LDU62166217unsigned NumElts = ResVT.getVectorNumElements();6218EVT EltVT = ResVT.getVectorElementType();62196220// Since LDU/LDG are target nodes, we cannot rely on DAG type6221// legalization.6222// Therefore, we must ensure the type is legal. 

static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &Results) {
  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass the legalization
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Reg = N->getOperand(1);
  SDValue Glue = N->getOperand(2);

  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
  SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
                                     N->getValueType(2)};
  SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};

  SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                             {NewValue.getValue(0), NewValue.getValue(1)});

  Results.push_back(Pair);
  Results.push_back(NewValue.getValue(2));
  Results.push_back(NewValue.getValue(3));
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  case ISD::CopyFromReg:
    ReplaceCopyFromReg_128(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
          STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;
      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
          STI.getPTXVersion() >= 78)
        return AtomicExpansionKind::None;
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}
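
// Quick reference derived from the policy above (descriptive only): 32-bit
// integer and/or/xor/xchg/add/sub/min/max RMW operations are kept for native
// atom.* selection; the 64-bit forms are native only when the subtarget
// reports hasAtomBitwise64() / hasAtomMinMax64(); 8- and 16-bit integer RMW
// and any unlisted operation are expanded to a compare-and-swap loop. For
// floating point, fadd is native for f32 unconditionally, for f16 on
// sm_70+/PTX 6.3+, for bf16 on sm_90+/PTX 7.8+, and for f64 when
// hasAtomAddF64(); every other FP RMW falls back to cmpxchg expansion.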

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}