GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
15
#include "MCTargetDesc/NVPTXBaseInfo.h"
16
#include "NVPTX.h"
17
#include "NVPTXSubtarget.h"
18
#include "NVPTXTargetMachine.h"
19
#include "NVPTXTargetObjectFile.h"
20
#include "NVPTXUtilities.h"
21
#include "llvm/ADT/APInt.h"
22
#include "llvm/ADT/STLExtras.h"
23
#include "llvm/ADT/SmallVector.h"
24
#include "llvm/ADT/StringRef.h"
25
#include "llvm/CodeGen/Analysis.h"
26
#include "llvm/CodeGen/ISDOpcodes.h"
27
#include "llvm/CodeGen/MachineFunction.h"
28
#include "llvm/CodeGen/MachineMemOperand.h"
29
#include "llvm/CodeGen/SelectionDAG.h"
30
#include "llvm/CodeGen/SelectionDAGNodes.h"
31
#include "llvm/CodeGen/TargetCallingConv.h"
32
#include "llvm/CodeGen/TargetLowering.h"
33
#include "llvm/CodeGen/ValueTypes.h"
34
#include "llvm/CodeGenTypes/MachineValueType.h"
35
#include "llvm/IR/Argument.h"
36
#include "llvm/IR/Attributes.h"
37
#include "llvm/IR/Constants.h"
38
#include "llvm/IR/DataLayout.h"
39
#include "llvm/IR/DerivedTypes.h"
40
#include "llvm/IR/DiagnosticInfo.h"
41
#include "llvm/IR/FPEnv.h"
42
#include "llvm/IR/Function.h"
43
#include "llvm/IR/GlobalValue.h"
44
#include "llvm/IR/Instruction.h"
45
#include "llvm/IR/Instructions.h"
46
#include "llvm/IR/IntrinsicsNVPTX.h"
47
#include "llvm/IR/Module.h"
48
#include "llvm/IR/Type.h"
49
#include "llvm/IR/Value.h"
50
#include "llvm/Support/Alignment.h"
51
#include "llvm/Support/Casting.h"
52
#include "llvm/Support/CodeGen.h"
53
#include "llvm/Support/CommandLine.h"
54
#include "llvm/Support/ErrorHandling.h"
55
#include "llvm/Support/raw_ostream.h"
56
#include "llvm/Target/TargetMachine.h"
57
#include "llvm/Target/TargetOptions.h"
58
#include <algorithm>
59
#include <cassert>
60
#include <cmath>
61
#include <cstdint>
62
#include <iterator>
63
#include <optional>
64
#include <sstream>
65
#include <string>
66
#include <utility>
67
#include <vector>
68
69
#define DEBUG_TYPE "nvptx-lower"
70
71
using namespace llvm;
72
73
static std::atomic<unsigned> GlobalUniqueCallSite;
74
75
static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
78
79
static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));
84
85
static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));
90
91
static cl::opt<bool> UsePrecSqrtF32(
92
"nvptx-prec-sqrtf32", cl::Hidden,
93
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
94
cl::init(true));
95
96
static cl::opt<bool> ForceMinByValParamAlign(
97
"nvptx-force-min-byval-param-align", cl::Hidden,
98
cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
99
" params of device functions."),
100
cl::init(false));
101
102
int NVPTXTargetLowering::getDivF32Level() const {
103
if (UsePrecDivF32.getNumOccurrences() > 0) {
104
// If nvptx-prec-divf32=N is used on the command-line, always honor it
105
return UsePrecDivF32;
106
} else {
107
// Otherwise, use div.approx if fast math is enabled
108
if (getTargetMachine().Options.UnsafeFPMath)
109
return 0;
110
else
111
return 2;
112
}
113
}
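// Note (illustrative, based on the nvptx-prec-divf32 option description
// above): the returned level selects the PTX divide flavor for f32 --
// 0 lowers to div.approx.f32, 1 to div.full.f32, and 2 to the IEEE-compliant
// div.rn.f32.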
114
115
bool NVPTXTargetLowering::usePrecSqrtF32() const {
116
if (UsePrecSqrtF32.getNumOccurrences() > 0) {
117
// If nvptx-prec-sqrtf32 is used on the command-line, always honor it
118
return UsePrecSqrtF32;
119
} else {
120
// Otherwise, use sqrt.approx if fast math is enabled
121
return !getTargetMachine().Options.UnsafeFPMath;
122
}
123
}
124
125
bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
126
return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
127
DenormalMode::PreserveSign;
128
}
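// In other words, f32 operations use their .ftz (flush-to-zero) PTX forms
// whenever the function's f32 denormal mode is "preserve-sign".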
129
130
static bool IsPTXVectorType(MVT VT) {
131
switch (VT.SimpleTy) {
132
default:
133
return false;
134
case MVT::v2i1:
135
case MVT::v4i1:
136
case MVT::v2i8:
137
case MVT::v4i8:
138
case MVT::v2i16:
139
case MVT::v4i16:
140
case MVT::v8i16: // <4 x i16x2>
141
case MVT::v2i32:
142
case MVT::v4i32:
143
case MVT::v2i64:
144
case MVT::v2f16:
145
case MVT::v4f16:
146
case MVT::v8f16: // <4 x f16x2>
147
case MVT::v2bf16:
148
case MVT::v4bf16:
149
case MVT::v8bf16: // <4 x bf16x2>
150
case MVT::v2f32:
151
case MVT::v4f32:
152
case MVT::v2f64:
153
return true;
154
}
155
}
156
157
static bool Is16bitsType(MVT VT) {
158
return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
159
VT.SimpleTy == MVT::i16);
160
}
161
162
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
163
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
164
/// into their primitive components.
165
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
166
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
167
/// LowerCall, and LowerReturn.
168
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
169
Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
170
SmallVectorImpl<uint64_t> *Offsets = nullptr,
171
uint64_t StartingOffset = 0) {
172
SmallVector<EVT, 16> TempVTs;
173
SmallVector<uint64_t, 16> TempOffsets;
174
175
// Special case for i128 - decompose to (i64, i64)
176
if (Ty->isIntegerTy(128)) {
177
ValueVTs.push_back(EVT(MVT::i64));
178
ValueVTs.push_back(EVT(MVT::i64));
179
180
if (Offsets) {
181
Offsets->push_back(StartingOffset + 0);
182
Offsets->push_back(StartingOffset + 8);
183
}
184
185
return;
186
}
187
188
// Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
189
if (StructType *STy = dyn_cast<StructType>(Ty)) {
190
auto const *SL = DL.getStructLayout(STy);
191
auto ElementNum = 0;
192
for(auto *EI : STy->elements()) {
193
ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
194
StartingOffset + SL->getElementOffset(ElementNum));
195
++ElementNum;
196
}
197
return;
198
}
199
200
ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
201
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
202
EVT VT = TempVTs[i];
203
uint64_t Off = TempOffsets[i];
204
// Split vectors into individual elements, except for v2f16, which
205
// we will pass as a single scalar.
206
if (VT.isVector()) {
207
unsigned NumElts = VT.getVectorNumElements();
208
EVT EltVT = VT.getVectorElementType();
209
// Vectors with an even number of f16 elements will be passed to
210
// us as an array of v2f16/v2bf16 elements. We must match this so we
211
// stay in sync with Ins/Outs.
212
if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
213
switch (EltVT.getSimpleVT().SimpleTy) {
214
case MVT::f16:
215
EltVT = MVT::v2f16;
216
break;
217
case MVT::bf16:
218
EltVT = MVT::v2bf16;
219
break;
220
case MVT::i16:
221
EltVT = MVT::v2i16;
222
break;
223
default:
224
llvm_unreachable("Unexpected type");
225
}
226
NumElts /= 2;
227
} else if (EltVT.getSimpleVT() == MVT::i8 &&
228
(NumElts % 4 == 0 || NumElts == 3)) {
229
// v*i8 are formally lowered as v4i8
230
EltVT = MVT::v4i8;
231
NumElts = (NumElts + 3) / 4;
232
} else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
233
// v2i8 is promoted to v2i16
234
NumElts = 1;
235
EltVT = MVT::v2i16;
236
}
237
for (unsigned j = 0; j != NumElts; ++j) {
238
ValueVTs.push_back(EltVT);
239
if (Offsets)
240
Offsets->push_back(Off + j * EltVT.getStoreSize());
241
}
242
} else {
243
ValueVTs.push_back(VT);
244
if (Offsets)
245
Offsets->push_back(Off);
246
}
247
}
248
}
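// Illustrative examples (not part of the upstream source) of the flattening
// performed above:
//   i128        -> ValueVTs = {i64, i64},     Offsets = {0, 8}
//   <4 x half>  -> ValueVTs = {v2f16, v2f16}, Offsets = {0, 4}
//   <3 x i8>    -> ValueVTs = {v4i8},         Offsets = {0}
// Scalars and pointer types pass through ComputeValueVTs unchanged.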
249
250
/// PromoteScalarIntegerPTX
251
/// Used to make sure the arguments/returns are suitable for passing
252
/// and promote them to a larger size if they're not.
253
///
254
/// The promoted type is placed in \p PromotedVT if the function returns true.
255
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
256
if (VT.isScalarInteger()) {
257
switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
258
default:
259
llvm_unreachable(
260
"Promotion is not suitable for scalars of size larger than 64-bits");
261
case 1:
262
*PromotedVT = MVT::i1;
263
break;
264
case 2:
265
case 4:
266
case 8:
267
*PromotedVT = MVT::i8;
268
break;
269
case 16:
270
*PromotedVT = MVT::i16;
271
break;
272
case 32:
273
*PromotedVT = MVT::i32;
274
break;
275
case 64:
276
*PromotedVT = MVT::i64;
277
break;
278
}
279
return EVT(*PromotedVT) != VT;
280
}
281
return false;
282
}
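// For example, an i17 argument is promoted to i32 (and the function returns
// true), an i4 is widened to i8, while i8/i16/i32/i64 are already suitable
// and are left unchanged (the function returns false).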
283
284
// Check whether we can merge loads/stores of some of the pieces of a
285
// flattened function parameter or return value into a single vector
286
// load/store.
287
//
288
// The flattened parameter is represented as a list of EVTs and
289
// offsets, and the whole structure is aligned to ParamAlignment. This
290
// function determines whether we can load/store pieces of the
291
// parameter starting at index Idx using a single vectorized op of
292
// size AccessSize. If so, it returns the number of param pieces
293
// covered by the vector op. Otherwise, it returns 1.
294
static unsigned CanMergeParamLoadStoresStartingAt(
295
unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
296
const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
297
298
// Can't vectorize if param alignment is not sufficient.
299
if (ParamAlignment < AccessSize)
300
return 1;
301
// Can't vectorize if offset is not aligned.
302
if (Offsets[Idx] & (AccessSize - 1))
303
return 1;
304
305
EVT EltVT = ValueVTs[Idx];
306
unsigned EltSize = EltVT.getStoreSize();
307
308
// Element is too large to vectorize.
309
if (EltSize >= AccessSize)
310
return 1;
311
312
unsigned NumElts = AccessSize / EltSize;
313
// Can't vectorize if AccessSize is not a multiple of EltSize.
314
if (AccessSize != EltSize * NumElts)
315
return 1;
316
317
// We don't have enough elements to vectorize.
318
if (Idx + NumElts > ValueVTs.size())
319
return 1;
320
321
// PTX ISA can only deal with 2- and 4-element vector ops.
322
if (NumElts != 4 && NumElts != 2)
323
return 1;
324
325
for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
326
// Types do not match.
327
if (ValueVTs[j] != EltVT)
328
return 1;
329
330
// Elements are not contiguous.
331
if (Offsets[j] - Offsets[j - 1] != EltSize)
332
return 1;
333
}
334
// OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
335
return NumElts;
336
}
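// Worked example (illustrative): for four f32 pieces at offsets {0, 4, 8, 12}
// with ParamAlignment = 16, a 16-byte access starting at Idx = 0 covers all
// four pieces, so the function returns 4. With ParamAlignment = 8 the 16-byte
// access is rejected and the caller retries with AccessSize = 8, which merges
// the pieces two at a time.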
337
338
// Flags for tracking per-element vectorization state of loads/stores
339
// of a flattened function parameter or return value.
340
enum ParamVectorizationFlags {
341
PVF_INNER = 0x0, // Middle elements of a vector.
342
PVF_FIRST = 0x1, // First element of the vector.
343
PVF_LAST = 0x2, // Last element of the vector.
344
// Scalar is effectively a 1-element vector.
345
PVF_SCALAR = PVF_FIRST | PVF_LAST
346
};
347
348
// Computes whether and how we can vectorize the loads/stores of a
349
// flattened function parameter or return value.
350
//
351
// The flattened parameter is represented as the list of ValueVTs and
352
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
353
// of the same size as ValueVTs indicating how each piece should be
354
// loaded/stored (i.e. as a scalar, or as part of a vector
355
// load/store).
356
static SmallVector<ParamVectorizationFlags, 16>
357
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
358
const SmallVectorImpl<uint64_t> &Offsets,
359
Align ParamAlignment, bool IsVAArg = false) {
360
// Set vector size to match ValueVTs and mark all elements as
361
// scalars by default.
362
SmallVector<ParamVectorizationFlags, 16> VectorInfo;
363
VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
364
365
if (IsVAArg)
366
return VectorInfo;
367
368
// Check what we can vectorize using 128/64/32-bit accesses.
369
for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
370
// Skip elements we've already processed.
371
assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
372
for (unsigned AccessSize : {16, 8, 4, 2}) {
373
unsigned NumElts = CanMergeParamLoadStoresStartingAt(
374
I, AccessSize, ValueVTs, Offsets, ParamAlignment);
375
// Mark vectorized elements.
376
switch (NumElts) {
377
default:
378
llvm_unreachable("Unexpected return value");
379
case 1:
380
// Can't vectorize using this size, try next smaller size.
381
continue;
382
case 2:
383
assert(I + 1 < E && "Not enough elements.");
384
VectorInfo[I] = PVF_FIRST;
385
VectorInfo[I + 1] = PVF_LAST;
386
I += 1;
387
break;
388
case 4:
389
assert(I + 3 < E && "Not enough elements.");
390
VectorInfo[I] = PVF_FIRST;
391
VectorInfo[I + 1] = PVF_INNER;
392
VectorInfo[I + 2] = PVF_INNER;
393
VectorInfo[I + 3] = PVF_LAST;
394
I += 3;
395
break;
396
}
397
// Break out of the inner loop because we've already succeeded
398
// using largest possible AccessSize.
399
break;
400
}
401
}
402
return VectorInfo;
403
}
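// Illustrative results: {f32, f32, f32, f32} at offsets {0, 4, 8, 12} with a
// 16-byte alignment becomes {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e.
// a single v4 access, whereas mismatched types such as {i32, i8} stay
// {PVF_SCALAR, PVF_SCALAR} and are accessed one piece at a time.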
404
405
// NVPTXTargetLowering Constructor.
406
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
407
const NVPTXSubtarget &STI)
408
: TargetLowering(TM), nvTM(&TM), STI(STI) {
409
// Always lower memset, memcpy, and memmove intrinsics to load/store
// instructions, rather than generating calls to memset, memcpy, or memmove.
412
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
413
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
414
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
415
416
setBooleanContents(ZeroOrNegativeOneBooleanContent);
417
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
418
419
// Jump is Expensive. Don't create extra control flow for 'and', 'or'
420
// condition branches.
421
setJumpIsExpensive(true);
422
423
// Wide divides are _very_ slow. Try to reduce the width of the divide if
424
// possible.
425
addBypassSlowDiv(64, 32);
426
427
// By default, use the Source scheduling
428
if (sched4reg)
429
setSchedulingPreference(Sched::RegPressure);
430
else
431
setSchedulingPreference(Sched::Source);
432
433
auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
434
LegalizeAction NoF16Action) {
435
setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
436
};
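// For example, setFP16OperationAction(ISD::FADD, MVT::f16, Legal, Promote)
// below marks f16 FADD as Legal when the subtarget allows fp16 math and as
// Promote (typically to f32) otherwise.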
437
438
auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
439
LegalizeAction NoBF16Action) {
440
bool IsOpSupported = STI.hasBF16Math();
441
// A few of these instructions are available only on sm_90.
442
switch(Op) {
443
case ISD::FADD:
444
case ISD::FMUL:
445
case ISD::FSUB:
446
case ISD::SELECT:
447
case ISD::SELECT_CC:
448
case ISD::SETCC:
449
case ISD::FEXP2:
450
case ISD::FCEIL:
451
case ISD::FFLOOR:
452
case ISD::FNEARBYINT:
453
case ISD::FRINT:
454
case ISD::FROUNDEVEN:
455
case ISD::FTRUNC:
456
IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
457
break;
458
}
459
setOperationAction(
460
Op, VT, IsOpSupported ? Action : NoBF16Action);
461
};
462
463
auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
464
LegalizeAction NoI16x2Action) {
465
bool IsOpSupported = false;
466
// These instructions are available only on sm_90.
467
switch (Op) {
468
case ISD::ADD:
469
case ISD::SMAX:
470
case ISD::SMIN:
471
case ISD::UMIN:
472
case ISD::UMAX:
473
IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
474
break;
475
}
476
setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
477
};
478
479
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
480
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
481
addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
482
addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
483
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
484
addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
485
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
486
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
487
addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
488
addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
489
addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
490
addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
491
492
// Conversion to/from FP16/FP16x2 is always legal.
493
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
494
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
495
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
496
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
497
498
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
499
if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
500
setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
501
502
setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
503
setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
504
505
// Conversion to/from BF16/BF16x2 is always legal.
506
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
507
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
508
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
509
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
510
511
setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
512
setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
513
if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
514
AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
515
516
// Conversion to/from i16/i16x2 is always legal.
517
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
518
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
519
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
520
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
521
522
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
523
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
524
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
525
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
526
// Only logical ops can be done on v4i8 directly, others must be done
527
// elementwise.
528
setOperationAction(
529
{ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,
530
ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ,
531
ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR,
532
ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY,
533
ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY,
534
ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC,
535
ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX,
536
ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA,
537
ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO,
538
ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC,
539
ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT,
540
ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX,
541
ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,
542
ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,
543
ISD::USUBSAT},
544
MVT::v4i8, Expand);
545
546
// Operations not directly supported by NVPTX.
547
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
548
MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
549
MVT::i32, MVT::i64}) {
550
setOperationAction(ISD::SELECT_CC, VT, Expand);
551
setOperationAction(ISD::BR_CC, VT, Expand);
552
}
553
554
// Some SIGN_EXTEND_INREG can be done using cvt instruction.
555
// For others we will expand to a SHL/SRA pair.
556
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
557
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
558
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
559
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
560
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
561
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
562
563
setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
564
setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
565
setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
566
setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
567
setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
568
setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
569
570
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
571
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
572
573
// TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
574
// that don't have h/w rotation we lower them to multi-instruction assembly.
575
// See ROT*_sw in NVPTXIntrInfo.td
576
setOperationAction(ISD::ROTL, MVT::i64, Legal);
577
setOperationAction(ISD::ROTR, MVT::i64, Legal);
578
setOperationAction(ISD::ROTL, MVT::i32, Legal);
579
setOperationAction(ISD::ROTR, MVT::i32, Legal);
580
581
setOperationAction(ISD::ROTL, MVT::i16, Expand);
582
setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
583
setOperationAction(ISD::ROTR, MVT::i16, Expand);
584
setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
585
setOperationAction(ISD::ROTL, MVT::i8, Expand);
586
setOperationAction(ISD::ROTR, MVT::i8, Expand);
587
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
588
589
// Indirect branch is not supported.
590
// This also disables Jump Table creation.
591
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
592
setOperationAction(ISD::BRIND, MVT::Other, Expand);
593
594
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
595
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
596
597
// We want to legalize constant-related memmove and memcpy intrinsics.
599
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
600
601
// Turn FP extload into load/fpextend
602
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
603
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
604
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
605
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
606
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
607
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
608
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
609
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
610
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
611
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
612
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
613
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
614
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
615
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
616
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
617
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
618
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
619
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
620
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
621
// Turn FP truncstore into trunc + store.
622
// FIXME: vector types should also be expanded
623
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
624
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
625
setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
626
setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
627
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
628
629
// PTX does not support load / store predicate registers
630
setOperationAction(ISD::LOAD, MVT::i1, Custom);
631
setOperationAction(ISD::STORE, MVT::i1, Custom);
632
633
for (MVT VT : MVT::integer_valuetypes()) {
634
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
635
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
636
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
637
setTruncStoreAction(VT, MVT::i1, Expand);
638
}
639
640
// expand extload of vector of integers.
641
setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
642
MVT::v2i8, Expand);
643
setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
644
645
// This is legal in NVPTX
646
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
647
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
648
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
649
setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
650
651
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
652
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
653
654
// TRAP can be lowered to PTX trap
655
setOperationAction(ISD::TRAP, MVT::Other, Legal);
656
657
// Register custom handling for vector loads/stores
658
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
659
if (IsPTXVectorType(VT)) {
660
setOperationAction(ISD::LOAD, VT, Custom);
661
setOperationAction(ISD::STORE, VT, Custom);
662
setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
663
}
664
}
665
666
// Support varargs.
667
setOperationAction(ISD::VASTART, MVT::Other, Custom);
668
setOperationAction(ISD::VAARG, MVT::Other, Custom);
669
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
670
setOperationAction(ISD::VAEND, MVT::Other, Expand);
671
672
// Custom handling for i8 intrinsics
673
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
674
675
for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
676
setOperationAction(ISD::ABS, Ty, Legal);
677
setOperationAction(ISD::SMIN, Ty, Legal);
678
setOperationAction(ISD::SMAX, Ty, Legal);
679
setOperationAction(ISD::UMIN, Ty, Legal);
680
setOperationAction(ISD::UMAX, Ty, Legal);
681
682
setOperationAction(ISD::CTPOP, Ty, Legal);
683
setOperationAction(ISD::CTLZ, Ty, Legal);
684
}
685
686
setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
687
setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
688
setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
689
setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
690
setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
691
setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
692
setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
693
694
setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
695
setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
696
setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
697
setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
698
setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
699
setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
700
701
// Other arithmetic and logic ops are unsupported.
702
setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
703
ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
704
ISD::SINT_TO_FP, ISD::UINT_TO_FP},
705
MVT::v2i16, Expand);
706
707
setOperationAction(ISD::ADDC, MVT::i32, Legal);
708
setOperationAction(ISD::ADDE, MVT::i32, Legal);
709
setOperationAction(ISD::SUBC, MVT::i32, Legal);
710
setOperationAction(ISD::SUBE, MVT::i32, Legal);
711
if (STI.getPTXVersion() >= 43) {
712
setOperationAction(ISD::ADDC, MVT::i64, Legal);
713
setOperationAction(ISD::ADDE, MVT::i64, Legal);
714
setOperationAction(ISD::SUBC, MVT::i64, Legal);
715
setOperationAction(ISD::SUBE, MVT::i64, Legal);
716
}
717
718
setOperationAction(ISD::CTTZ, MVT::i16, Expand);
719
setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
720
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
721
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
722
723
// PTX does not directly support SELP of i1, so promote to i32 first
724
setOperationAction(ISD::SELECT, MVT::i1, Custom);
725
726
// PTX cannot multiply two i64s in a single instruction.
727
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
728
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
729
730
// We have some custom DAG combine patterns for these nodes
731
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
732
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
733
ISD::VSELECT});
734
735
// setcc for f16x2 and bf16x2 needs special handling to prevent
736
// legalizer's attempt to scalarize it due to v2i1 not being legal.
737
if (STI.allowFP16Math() || STI.hasBF16Math())
738
setTargetDAGCombine(ISD::SETCC);
739
740
// Promote fp16 arithmetic if fp16 hardware isn't available or the
741
// user passed --nvptx-no-fp16-math. The flag is useful because,
742
// although sm_53+ GPUs have some sort of FP16 support in
743
// hardware, only sm_53 and sm_60 have a full implementation. Others have
// only a token amount of hardware and are likely to run faster by using
// fp32 units instead.
746
for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
747
setFP16OperationAction(Op, MVT::f16, Legal, Promote);
748
setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
749
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
750
// bf16 must be promoted to f32.
751
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
752
if (getOperationAction(Op, MVT::bf16) == Promote)
753
AddPromotedToType(Op, MVT::bf16, MVT::f32);
754
}
755
756
// f16/f16x2 neg was introduced in PTX 60, SM_53.
757
const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
758
STI.getPTXVersion() >= 60 &&
759
STI.allowFP16Math();
760
for (const auto &VT : {MVT::f16, MVT::v2f16})
761
setOperationAction(ISD::FNEG, VT,
762
IsFP16FP16x2NegAvailable ? Legal : Expand);
763
764
setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
765
setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
766
// (would be) Library functions.
767
768
// These map to conversion instructions for scalar FP types.
769
for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
770
ISD::FROUNDEVEN, ISD::FTRUNC}) {
771
setOperationAction(Op, MVT::f16, Legal);
772
setOperationAction(Op, MVT::f32, Legal);
773
setOperationAction(Op, MVT::f64, Legal);
774
setOperationAction(Op, MVT::v2f16, Expand);
775
setOperationAction(Op, MVT::v2bf16, Expand);
776
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
777
if (getOperationAction(Op, MVT::bf16) == Promote)
778
AddPromotedToType(Op, MVT::bf16, MVT::f32);
779
}
780
781
if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
782
setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
783
}
784
if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
785
for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
786
setOperationAction(ISD::FP_EXTEND, VT, Custom);
787
setOperationAction(ISD::FP_ROUND, VT, Custom);
788
}
789
}
790
791
// sm_80 only has conversions between f32 and bf16. Custom lower all other
792
// bf16 conversions.
793
if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
794
for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
795
setOperationAction(
796
{ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
797
VT, Custom);
798
}
799
setOperationAction(
800
{ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
801
MVT::bf16, Custom);
802
}
803
804
setOperationAction(ISD::FROUND, MVT::f16, Promote);
805
setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
806
setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
807
setOperationAction(ISD::FROUND, MVT::f32, Custom);
808
setOperationAction(ISD::FROUND, MVT::f64, Custom);
809
setOperationAction(ISD::FROUND, MVT::bf16, Promote);
810
AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
811
812
// 'Expand' implements FCOPYSIGN without calling an external library.
813
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
814
setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
815
setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
816
setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
817
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
818
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
819
820
// These map to corresponding instructions for f32/f64. f16 must be
821
// promoted to f32. v2f16 is expanded to f16, which is then promoted
822
// to f32.
823
for (const auto &Op :
824
{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
825
setOperationAction(Op, MVT::f16, Promote);
826
setOperationAction(Op, MVT::f32, Legal);
827
setOperationAction(Op, MVT::f64, Legal);
828
setOperationAction(Op, MVT::v2f16, Expand);
829
setOperationAction(Op, MVT::v2bf16, Expand);
830
setOperationAction(Op, MVT::bf16, Promote);
831
AddPromotedToType(Op, MVT::bf16, MVT::f32);
832
}
833
for (const auto &Op : {ISD::FABS}) {
834
setOperationAction(Op, MVT::f16, Promote);
835
setOperationAction(Op, MVT::f32, Legal);
836
setOperationAction(Op, MVT::f64, Legal);
837
setOperationAction(Op, MVT::v2f16, Expand);
838
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
839
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
840
if (getOperationAction(Op, MVT::bf16) == Promote)
841
AddPromotedToType(Op, MVT::bf16, MVT::f32);
842
}
843
844
// max.f16, max.f16x2 and max.NaN are supported on sm_80+.
845
auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
846
bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
847
return IsAtLeastSm80 ? Legal : NotSm80Action;
848
};
849
for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
850
setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
851
setOperationAction(Op, MVT::f32, Legal);
852
setOperationAction(Op, MVT::f64, Legal);
853
setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
854
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
855
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
856
if (getOperationAction(Op, MVT::bf16) == Promote)
857
AddPromotedToType(Op, MVT::bf16, MVT::f32);
858
}
859
for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
860
setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
861
setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
862
setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
863
setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
864
setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
865
}
866
867
// Custom lowering for inline asm with 128-bit operands
868
setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
869
setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);
870
871
// No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
872
// No FPOW or FREM in PTX.
873
874
// Now deduce the information based on the above-mentioned actions.
876
computeRegisterProperties(STI.getRegisterInfo());
877
878
setMinCmpXchgSizeInBits(32);
879
setMaxAtomicSizeInBitsSupported(64);
880
setMaxDivRemBitWidthSupported(64);
881
}
882
883
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
884
885
#define MAKE_CASE(V) \
886
case V: \
887
return #V;
888
889
switch ((NVPTXISD::NodeType)Opcode) {
890
case NVPTXISD::FIRST_NUMBER:
891
break;
892
893
MAKE_CASE(NVPTXISD::CALL)
894
MAKE_CASE(NVPTXISD::RET_GLUE)
895
MAKE_CASE(NVPTXISD::LOAD_PARAM)
896
MAKE_CASE(NVPTXISD::Wrapper)
897
MAKE_CASE(NVPTXISD::DeclareParam)
898
MAKE_CASE(NVPTXISD::DeclareScalarParam)
899
MAKE_CASE(NVPTXISD::DeclareRet)
900
MAKE_CASE(NVPTXISD::DeclareScalarRet)
901
MAKE_CASE(NVPTXISD::DeclareRetParam)
902
MAKE_CASE(NVPTXISD::PrintCall)
903
MAKE_CASE(NVPTXISD::PrintConvergentCall)
904
MAKE_CASE(NVPTXISD::PrintCallUni)
905
MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
906
MAKE_CASE(NVPTXISD::LoadParam)
907
MAKE_CASE(NVPTXISD::LoadParamV2)
908
MAKE_CASE(NVPTXISD::LoadParamV4)
909
MAKE_CASE(NVPTXISD::StoreParam)
910
MAKE_CASE(NVPTXISD::StoreParamV2)
911
MAKE_CASE(NVPTXISD::StoreParamV4)
912
MAKE_CASE(NVPTXISD::StoreParamS32)
913
MAKE_CASE(NVPTXISD::StoreParamU32)
914
MAKE_CASE(NVPTXISD::CallArgBegin)
915
MAKE_CASE(NVPTXISD::CallArg)
916
MAKE_CASE(NVPTXISD::LastCallArg)
917
MAKE_CASE(NVPTXISD::CallArgEnd)
918
MAKE_CASE(NVPTXISD::CallVoid)
919
MAKE_CASE(NVPTXISD::CallVal)
920
MAKE_CASE(NVPTXISD::CallSymbol)
921
MAKE_CASE(NVPTXISD::Prototype)
922
MAKE_CASE(NVPTXISD::MoveParam)
923
MAKE_CASE(NVPTXISD::StoreRetval)
924
MAKE_CASE(NVPTXISD::StoreRetvalV2)
925
MAKE_CASE(NVPTXISD::StoreRetvalV4)
926
MAKE_CASE(NVPTXISD::PseudoUseParam)
927
MAKE_CASE(NVPTXISD::RETURN)
928
MAKE_CASE(NVPTXISD::CallSeqBegin)
929
MAKE_CASE(NVPTXISD::CallSeqEnd)
930
MAKE_CASE(NVPTXISD::CallPrototype)
931
MAKE_CASE(NVPTXISD::ProxyReg)
932
MAKE_CASE(NVPTXISD::LoadV2)
933
MAKE_CASE(NVPTXISD::LoadV4)
934
MAKE_CASE(NVPTXISD::LDGV2)
935
MAKE_CASE(NVPTXISD::LDGV4)
936
MAKE_CASE(NVPTXISD::LDUV2)
937
MAKE_CASE(NVPTXISD::LDUV4)
938
MAKE_CASE(NVPTXISD::StoreV2)
939
MAKE_CASE(NVPTXISD::StoreV4)
940
MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
941
MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
942
MAKE_CASE(NVPTXISD::IMAD)
943
MAKE_CASE(NVPTXISD::BFE)
944
MAKE_CASE(NVPTXISD::BFI)
945
MAKE_CASE(NVPTXISD::PRMT)
946
MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
947
MAKE_CASE(NVPTXISD::SETP_F16X2)
948
MAKE_CASE(NVPTXISD::SETP_BF16X2)
949
MAKE_CASE(NVPTXISD::Dummy)
950
MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
951
MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
952
MAKE_CASE(NVPTXISD::Tex1DFloatS32)
953
MAKE_CASE(NVPTXISD::Tex1DFloatFloat)
954
MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)
955
MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad)
956
MAKE_CASE(NVPTXISD::Tex1DS32S32)
957
MAKE_CASE(NVPTXISD::Tex1DS32Float)
958
MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel)
959
MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad)
960
MAKE_CASE(NVPTXISD::Tex1DU32S32)
961
MAKE_CASE(NVPTXISD::Tex1DU32Float)
962
MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel)
963
MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad)
964
MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32)
965
MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat)
966
MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel)
967
MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad)
968
MAKE_CASE(NVPTXISD::Tex1DArrayS32S32)
969
MAKE_CASE(NVPTXISD::Tex1DArrayS32Float)
970
MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel)
971
MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad)
972
MAKE_CASE(NVPTXISD::Tex1DArrayU32S32)
973
MAKE_CASE(NVPTXISD::Tex1DArrayU32Float)
974
MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel)
975
MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad)
976
MAKE_CASE(NVPTXISD::Tex2DFloatS32)
977
MAKE_CASE(NVPTXISD::Tex2DFloatFloat)
978
MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel)
979
MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad)
980
MAKE_CASE(NVPTXISD::Tex2DS32S32)
981
MAKE_CASE(NVPTXISD::Tex2DS32Float)
982
MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel)
983
MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad)
984
MAKE_CASE(NVPTXISD::Tex2DU32S32)
985
MAKE_CASE(NVPTXISD::Tex2DU32Float)
986
MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel)
987
MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad)
988
MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32)
989
MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat)
990
MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel)
991
MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad)
992
MAKE_CASE(NVPTXISD::Tex2DArrayS32S32)
993
MAKE_CASE(NVPTXISD::Tex2DArrayS32Float)
994
MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel)
995
MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad)
996
MAKE_CASE(NVPTXISD::Tex2DArrayU32S32)
997
MAKE_CASE(NVPTXISD::Tex2DArrayU32Float)
998
MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel)
999
MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad)
1000
MAKE_CASE(NVPTXISD::Tex3DFloatS32)
1001
MAKE_CASE(NVPTXISD::Tex3DFloatFloat)
1002
MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel)
1003
MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad)
1004
MAKE_CASE(NVPTXISD::Tex3DS32S32)
1005
MAKE_CASE(NVPTXISD::Tex3DS32Float)
1006
MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel)
1007
MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad)
1008
MAKE_CASE(NVPTXISD::Tex3DU32S32)
1009
MAKE_CASE(NVPTXISD::Tex3DU32Float)
1010
MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel)
1011
MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad)
1012
MAKE_CASE(NVPTXISD::TexCubeFloatFloat)
1013
MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel)
1014
MAKE_CASE(NVPTXISD::TexCubeS32Float)
1015
MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel)
1016
MAKE_CASE(NVPTXISD::TexCubeU32Float)
1017
MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel)
1018
MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat)
1019
MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel)
1020
MAKE_CASE(NVPTXISD::TexCubeArrayS32Float)
1021
MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel)
1022
MAKE_CASE(NVPTXISD::TexCubeArrayU32Float)
1023
MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel)
1024
MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat)
1025
MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat)
1026
MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat)
1027
MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat)
1028
MAKE_CASE(NVPTXISD::Tld4R2DS64Float)
1029
MAKE_CASE(NVPTXISD::Tld4G2DS64Float)
1030
MAKE_CASE(NVPTXISD::Tld4B2DS64Float)
1031
MAKE_CASE(NVPTXISD::Tld4A2DS64Float)
1032
MAKE_CASE(NVPTXISD::Tld4R2DU64Float)
1033
MAKE_CASE(NVPTXISD::Tld4G2DU64Float)
1034
MAKE_CASE(NVPTXISD::Tld4B2DU64Float)
1035
MAKE_CASE(NVPTXISD::Tld4A2DU64Float)
1036
1037
MAKE_CASE(NVPTXISD::TexUnified1DFloatS32)
1038
MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat)
1039
MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel)
1040
MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad)
1041
MAKE_CASE(NVPTXISD::TexUnified1DS32S32)
1042
MAKE_CASE(NVPTXISD::TexUnified1DS32Float)
1043
MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel)
1044
MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad)
1045
MAKE_CASE(NVPTXISD::TexUnified1DU32S32)
1046
MAKE_CASE(NVPTXISD::TexUnified1DU32Float)
1047
MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel)
1048
MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad)
1049
MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32)
1050
MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat)
1051
MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel)
1052
MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad)
1053
MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32)
1054
MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float)
1055
MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel)
1056
MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad)
1057
MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32)
1058
MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float)
1059
MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel)
1060
MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad)
1061
MAKE_CASE(NVPTXISD::TexUnified2DFloatS32)
1062
MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat)
1063
MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel)
1064
MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad)
1065
MAKE_CASE(NVPTXISD::TexUnified2DS32S32)
1066
MAKE_CASE(NVPTXISD::TexUnified2DS32Float)
1067
MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel)
1068
MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad)
1069
MAKE_CASE(NVPTXISD::TexUnified2DU32S32)
1070
MAKE_CASE(NVPTXISD::TexUnified2DU32Float)
1071
MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel)
1072
MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad)
1073
MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32)
1074
MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat)
1075
MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel)
1076
MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad)
1077
MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32)
1078
MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float)
1079
MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel)
1080
MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad)
1081
MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32)
1082
MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float)
1083
MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel)
1084
MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad)
1085
MAKE_CASE(NVPTXISD::TexUnified3DFloatS32)
1086
MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat)
1087
MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel)
1088
MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad)
1089
MAKE_CASE(NVPTXISD::TexUnified3DS32S32)
1090
MAKE_CASE(NVPTXISD::TexUnified3DS32Float)
1091
MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel)
1092
MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad)
1093
MAKE_CASE(NVPTXISD::TexUnified3DU32S32)
1094
MAKE_CASE(NVPTXISD::TexUnified3DU32Float)
1095
MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel)
1096
MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad)
1097
MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat)
1098
MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel)
1099
MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float)
1100
MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel)
1101
MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float)
1102
MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel)
1103
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat)
1104
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel)
1105
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float)
1106
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel)
1107
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float)
1108
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel)
1109
MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad)
1110
MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad)
1111
MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad)
1112
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad)
1113
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad)
1114
MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad)
1115
MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat)
1116
MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat)
1117
MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat)
1118
MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat)
1119
MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float)
1120
MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float)
1121
MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float)
1122
MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float)
1123
MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float)
1124
MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float)
1125
MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float)
1126
MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float)
1127
1128
MAKE_CASE(NVPTXISD::Suld1DI8Clamp)
1129
MAKE_CASE(NVPTXISD::Suld1DI16Clamp)
1130
MAKE_CASE(NVPTXISD::Suld1DI32Clamp)
1131
MAKE_CASE(NVPTXISD::Suld1DI64Clamp)
1132
MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp)
1133
MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp)
1134
MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp)
1135
MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp)
1136
MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp)
1137
MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp)
1138
MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp)
1139
1140
MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp)
1141
MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp)
1142
MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp)
1143
MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp)
1144
MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp)
1145
MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp)
1146
MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp)
1147
MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp)
1148
MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp)
1149
MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp)
1150
MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp)
1151
1152
MAKE_CASE(NVPTXISD::Suld2DI8Clamp)
1153
MAKE_CASE(NVPTXISD::Suld2DI16Clamp)
1154
MAKE_CASE(NVPTXISD::Suld2DI32Clamp)
1155
MAKE_CASE(NVPTXISD::Suld2DI64Clamp)
1156
MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp)
1157
MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp)
1158
MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp)
1159
MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp)
1160
MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp)
1161
MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp)
1162
MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp)
1163
1164
MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp)
1165
MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp)
1166
MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp)
1167
MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp)
1168
MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp)
1169
MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp)
1170
MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp)
1171
MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp)
1172
MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp)
1173
MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp)
1174
MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp)
1175
1176
MAKE_CASE(NVPTXISD::Suld3DI8Clamp)
1177
MAKE_CASE(NVPTXISD::Suld3DI16Clamp)
1178
MAKE_CASE(NVPTXISD::Suld3DI32Clamp)
1179
MAKE_CASE(NVPTXISD::Suld3DI64Clamp)
1180
MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp)
1181
MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp)
1182
MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp)
1183
MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp)
1184
MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp)
1185
MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp)
1186
MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp)
1187
1188
MAKE_CASE(NVPTXISD::Suld1DI8Trap)
1189
MAKE_CASE(NVPTXISD::Suld1DI16Trap)
1190
MAKE_CASE(NVPTXISD::Suld1DI32Trap)
1191
MAKE_CASE(NVPTXISD::Suld1DI64Trap)
1192
MAKE_CASE(NVPTXISD::Suld1DV2I8Trap)
1193
MAKE_CASE(NVPTXISD::Suld1DV2I16Trap)
1194
MAKE_CASE(NVPTXISD::Suld1DV2I32Trap)
1195
MAKE_CASE(NVPTXISD::Suld1DV2I64Trap)
1196
MAKE_CASE(NVPTXISD::Suld1DV4I8Trap)
1197
MAKE_CASE(NVPTXISD::Suld1DV4I16Trap)
1198
MAKE_CASE(NVPTXISD::Suld1DV4I32Trap)
1199
1200
MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap)
1201
MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap)
1202
MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap)
1203
MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap)
1204
MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap)
1205
MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap)
1206
MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap)
1207
MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap)
1208
MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap)
1209
MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap)
1210
MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap)
1211
1212
MAKE_CASE(NVPTXISD::Suld2DI8Trap)
1213
MAKE_CASE(NVPTXISD::Suld2DI16Trap)
1214
MAKE_CASE(NVPTXISD::Suld2DI32Trap)
1215
MAKE_CASE(NVPTXISD::Suld2DI64Trap)
1216
MAKE_CASE(NVPTXISD::Suld2DV2I8Trap)
1217
MAKE_CASE(NVPTXISD::Suld2DV2I16Trap)
1218
MAKE_CASE(NVPTXISD::Suld2DV2I32Trap)
1219
MAKE_CASE(NVPTXISD::Suld2DV2I64Trap)
1220
MAKE_CASE(NVPTXISD::Suld2DV4I8Trap)
1221
MAKE_CASE(NVPTXISD::Suld2DV4I16Trap)
1222
MAKE_CASE(NVPTXISD::Suld2DV4I32Trap)
1223
1224
MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap)
1225
MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap)
1226
MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap)
1227
MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap)
1228
MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap)
1229
MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap)
1230
MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap)
1231
MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap)
1232
MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap)
1233
MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap)
1234
MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap)
1235
1236
MAKE_CASE(NVPTXISD::Suld3DI8Trap)
1237
MAKE_CASE(NVPTXISD::Suld3DI16Trap)
1238
MAKE_CASE(NVPTXISD::Suld3DI32Trap)
1239
MAKE_CASE(NVPTXISD::Suld3DI64Trap)
1240
MAKE_CASE(NVPTXISD::Suld3DV2I8Trap)
1241
MAKE_CASE(NVPTXISD::Suld3DV2I16Trap)
1242
MAKE_CASE(NVPTXISD::Suld3DV2I32Trap)
1243
MAKE_CASE(NVPTXISD::Suld3DV2I64Trap)
1244
MAKE_CASE(NVPTXISD::Suld3DV4I8Trap)
1245
MAKE_CASE(NVPTXISD::Suld3DV4I16Trap)
1246
MAKE_CASE(NVPTXISD::Suld3DV4I32Trap)
1247
1248
MAKE_CASE(NVPTXISD::Suld1DI8Zero)
1249
MAKE_CASE(NVPTXISD::Suld1DI16Zero)
1250
MAKE_CASE(NVPTXISD::Suld1DI32Zero)
1251
MAKE_CASE(NVPTXISD::Suld1DI64Zero)
1252
MAKE_CASE(NVPTXISD::Suld1DV2I8Zero)
1253
MAKE_CASE(NVPTXISD::Suld1DV2I16Zero)
1254
MAKE_CASE(NVPTXISD::Suld1DV2I32Zero)
1255
MAKE_CASE(NVPTXISD::Suld1DV2I64Zero)
1256
MAKE_CASE(NVPTXISD::Suld1DV4I8Zero)
1257
MAKE_CASE(NVPTXISD::Suld1DV4I16Zero)
1258
MAKE_CASE(NVPTXISD::Suld1DV4I32Zero)
1259
1260
MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero)
1261
MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero)
1262
MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero)
1263
MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero)
1264
MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero)
1265
MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero)
1266
MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero)
1267
MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero)
1268
MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero)
1269
MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero)
1270
MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero)
1271
1272
MAKE_CASE(NVPTXISD::Suld2DI8Zero)
1273
MAKE_CASE(NVPTXISD::Suld2DI16Zero)
1274
MAKE_CASE(NVPTXISD::Suld2DI32Zero)
1275
MAKE_CASE(NVPTXISD::Suld2DI64Zero)
1276
MAKE_CASE(NVPTXISD::Suld2DV2I8Zero)
1277
MAKE_CASE(NVPTXISD::Suld2DV2I16Zero)
1278
MAKE_CASE(NVPTXISD::Suld2DV2I32Zero)
1279
MAKE_CASE(NVPTXISD::Suld2DV2I64Zero)
1280
MAKE_CASE(NVPTXISD::Suld2DV4I8Zero)
1281
MAKE_CASE(NVPTXISD::Suld2DV4I16Zero)
1282
MAKE_CASE(NVPTXISD::Suld2DV4I32Zero)
1283
1284
MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero)
1285
MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero)
1286
MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero)
1287
MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero)
1288
MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero)
1289
MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero)
1290
MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero)
1291
MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero)
1292
MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero)
1293
MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero)
1294
MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero)
1295
1296
MAKE_CASE(NVPTXISD::Suld3DI8Zero)
1297
MAKE_CASE(NVPTXISD::Suld3DI16Zero)
1298
MAKE_CASE(NVPTXISD::Suld3DI32Zero)
1299
MAKE_CASE(NVPTXISD::Suld3DI64Zero)
1300
MAKE_CASE(NVPTXISD::Suld3DV2I8Zero)
1301
MAKE_CASE(NVPTXISD::Suld3DV2I16Zero)
1302
MAKE_CASE(NVPTXISD::Suld3DV2I32Zero)
1303
MAKE_CASE(NVPTXISD::Suld3DV2I64Zero)
1304
MAKE_CASE(NVPTXISD::Suld3DV4I8Zero)
1305
MAKE_CASE(NVPTXISD::Suld3DV4I16Zero)
1306
MAKE_CASE(NVPTXISD::Suld3DV4I32Zero)
1307
}
1308
return nullptr;
1309
1310
#undef MAKE_CASE
1311
}
1312
1313
TargetLoweringBase::LegalizeTypeAction
1314
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1315
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1316
VT.getScalarType() == MVT::i1)
1317
return TypeSplitVector;
1318
if (Isv2x16VT(VT))
1319
return TypeLegal;
1320
return TargetLoweringBase::getPreferredVectorAction(VT);
1321
}
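// For instance, v2i1/v4i1 are split into scalar i1 operations, while v2f16,
// v2bf16, and v2i16 are kept legal and handled as packed 32-bit registers.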
1322
1323
SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1324
int Enabled, int &ExtraSteps,
1325
bool &UseOneConst,
1326
bool Reciprocal) const {
1327
if (!(Enabled == ReciprocalEstimate::Enabled ||
1328
(Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1329
return SDValue();
1330
1331
if (ExtraSteps == ReciprocalEstimate::Unspecified)
1332
ExtraSteps = 0;
1333
1334
SDLoc DL(Operand);
1335
EVT VT = Operand.getValueType();
1336
bool Ftz = useF32FTZ(DAG.getMachineFunction());
1337
1338
auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1339
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1340
DAG.getConstant(IID, DL, MVT::i32), Operand);
1341
};
1342
1343
// The sqrt and rsqrt refinement processes assume we always start out with an
1344
// approximation of the rsqrt. Therefore, if we're going to do any refinement
1345
// (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1346
// any refinement, we must return a regular sqrt.
1347
if (Reciprocal || ExtraSteps > 0) {
1348
if (VT == MVT::f32)
1349
return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1350
: Intrinsic::nvvm_rsqrt_approx_f);
1351
else if (VT == MVT::f64)
1352
return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1353
else
1354
return SDValue();
1355
} else {
1356
if (VT == MVT::f32)
1357
return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1358
: Intrinsic::nvvm_sqrt_approx_f);
1359
else {
1360
// There's no sqrt.approx.f64 instruction, so we emit
1361
// reciprocal(rsqrt(x)). This is faster than
1362
// select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1363
// x * rsqrt(x).)
1364
return DAG.getNode(
1365
ISD::INTRINSIC_WO_CHAIN, DL, VT,
1366
DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1367
MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1368
}
1369
}
1370
}
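// Illustrative lowering (assuming the usual PTX mnemonics): an f32 sqrt with
// no refinement becomes llvm.nvvm.sqrt.approx.f (sqrt.approx.f32, or the .ftz
// variant), an rsqrt becomes rsqrt.approx.f32, and an approximate f64 sqrt is
// built as rcp.approx.ftz.d applied to rsqrt.approx.d, as described above.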
1371
1372
SDValue
1373
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1374
SDLoc dl(Op);
1375
const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1376
auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1377
Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1378
return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1379
}
1380
1381
static bool IsTypePassedAsArray(const Type *Ty) {
1382
return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1383
Ty->isHalfTy() || Ty->isBFloatTy();
1384
}
1385
1386
std::string NVPTXTargetLowering::getPrototype(
1387
const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1388
const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1389
std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1390
const CallBase &CB, unsigned UniqueCallSite) const {
1391
auto PtrVT = getPointerTy(DL);
1392
1393
bool isABI = (STI.getSmVersion() >= 20);
1394
assert(isABI && "Non-ABI compilation is not supported");
1395
if (!isABI)
1396
return "";
1397
1398
std::string Prototype;
1399
raw_string_ostream O(Prototype);
1400
O << "prototype_" << UniqueCallSite << " : .callprototype ";
1401
1402
if (retTy->getTypeID() == Type::VoidTyID) {
1403
O << "()";
1404
} else {
1405
O << "(";
1406
if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1407
!IsTypePassedAsArray(retTy)) {
1408
unsigned size = 0;
1409
if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1410
size = ITy->getBitWidth();
1411
} else {
1412
assert(retTy->isFloatingPointTy() &&
1413
"Floating point type expected here");
1414
size = retTy->getPrimitiveSizeInBits();
1415
}
1416
// PTX ABI requires all scalar return values to be at least 32
1417
// bits in size. fp16 normally uses .b16 as its storage type in
1418
// PTX, so its size must be adjusted here, too.
1419
size = promoteScalarArgumentSize(size);
1420
1421
O << ".param .b" << size << " _";
1422
} else if (isa<PointerType>(retTy)) {
1423
O << ".param .b" << PtrVT.getSizeInBits() << " _";
1424
} else if (IsTypePassedAsArray(retTy)) {
1425
O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1426
<< " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1427
} else {
1428
llvm_unreachable("Unknown return type");
1429
}
1430
O << ") ";
1431
}
1432
O << "_ (";
1433
1434
bool first = true;
1435
1436
unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1437
for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1438
Type *Ty = Args[i].Ty;
1439
if (!first) {
1440
O << ", ";
1441
}
1442
first = false;
1443
1444
if (!Outs[OIdx].Flags.isByVal()) {
1445
if (IsTypePassedAsArray(Ty)) {
1446
Align ParamAlign =
1447
getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
1448
O << ".param .align " << ParamAlign.value() << " .b8 ";
1449
O << "_";
1450
O << "[" << DL.getTypeAllocSize(Ty) << "]";
1451
// update the index for Outs
1452
SmallVector<EVT, 16> vtparts;
1453
ComputeValueVTs(*this, DL, Ty, vtparts);
1454
if (unsigned len = vtparts.size())
1455
OIdx += len - 1;
1456
continue;
1457
}
1458
// i8 types in IR will be i16 types in SDAG
1459
assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1460
(getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1461
"type mismatch between callee prototype and arguments");
1462
// scalar type
1463
unsigned sz = 0;
1464
if (isa<IntegerType>(Ty)) {
1465
sz = cast<IntegerType>(Ty)->getBitWidth();
1466
sz = promoteScalarArgumentSize(sz);
1467
} else if (isa<PointerType>(Ty)) {
1468
sz = PtrVT.getSizeInBits();
1469
} else {
1470
sz = Ty->getPrimitiveSizeInBits();
1471
}
1472
O << ".param .b" << sz << " ";
1473
O << "_";
1474
continue;
1475
}
1476
1477
// Indirect calls need strict ABI alignment so we disable optimizations by
1478
// not providing a function to optimize.
1479
Type *ETy = Args[i].IndirectType;
1480
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1481
Align ParamByValAlign =
1482
getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1483
1484
O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1485
O << "_";
1486
O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1487
}
1488
1489
if (VAInfo)
1490
O << (first ? "" : ",") << " .param .align " << VAInfo->second
1491
<< " .b8 _[]\n";
1492
O << ")";
1493
if (shouldEmitPTXNoReturn(&CB, *nvTM))
1494
O << " .noreturn";
1495
O << ";";
1496
1497
return Prototype;
1498
}
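// Illustrative example of the string built above, assuming a hypothetical
// indirect callee of type `i32 (float, ptr)` on a 64-bit target:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _,
//                                                   .param .b64 _);
// i.e. the return value first, then one .param entry per argument, with
// scalar sizes promoted to at least 32 bits and pointers using PtrVT.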
1499
1500
Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1501
const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1502
return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1503
}
1504
1505
Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1506
unsigned Idx,
1507
const DataLayout &DL) const {
1508
if (!CB) {
1509
// CallSite is zero, fall back to the ABI type alignment
1510
return DL.getABITypeAlign(Ty);
1511
}
1512
1513
const Function *DirectCallee = CB->getCalledFunction();
1514
1515
if (!DirectCallee) {
1516
// We don't have a direct function symbol, but that may be because of
1517
// constant cast instructions in the call.
1518
1519
// With bitcast'd call targets, the instruction will be the call
1520
if (const auto *CI = dyn_cast<CallInst>(CB)) {
1521
// Check if we have call alignment metadata
1522
if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1523
return StackAlign.value();
1524
}
1525
DirectCallee = getMaybeBitcastedCallee(CB);
1526
}
1527
1528
// Check for function alignment information if we found that the
1529
// ultimate target is a Function
1530
if (DirectCallee)
1531
return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1532
1533
// Call is indirect, fall back to the ABI type alignment
1534
return DL.getABITypeAlign(Ty);
1535
}
1536
1537
static bool adjustElementType(EVT &ElementType) {
1538
switch (ElementType.getSimpleVT().SimpleTy) {
1539
default:
1540
return false;
1541
case MVT::f16:
1542
case MVT::bf16:
1543
ElementType = MVT::i16;
1544
return true;
1545
case MVT::f32:
1546
case MVT::v2f16:
1547
case MVT::v2bf16:
1548
ElementType = MVT::i32;
1549
return true;
1550
case MVT::f64:
1551
ElementType = MVT::i64;
1552
return true;
1553
}
1554
}
1555
1556
// Use byte-store when the param address of the argument value is unaligned.
1557
// This may happen when the return value is a field of a packed structure.
1558
//
1559
// This is called in LowerCall() when passing the param values.
1560
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1561
uint64_t Offset, EVT ElementType,
1562
SDValue StVal, SDValue &InGlue,
1563
unsigned ArgID, const SDLoc &dl) {
1564
// Bit logic only works on integer types
1565
if (adjustElementType(ElementType))
1566
StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1567
1568
// Store each byte
1569
SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1570
for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1571
// Shift the byte to the last byte position
1572
SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1573
DAG.getConstant(i * 8, dl, MVT::i32));
1574
SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1575
DAG.getConstant(Offset + i, dl, MVT::i32),
1576
ShiftVal, InGlue};
1577
// Truncating store of only the last byte using st.param.b8;
// the register type can be larger than b8.
1580
Chain = DAG.getMemIntrinsicNode(
1581
NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1582
MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
1583
InGlue = Chain.getValue(1);
1584
}
1585
return Chain;
1586
}
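// Sketch of the expansion above for a hypothetical 32-bit value at byte
// offset 1 of its param (for illustration): the value is shifted right by
// 0, 8, 16 and 24 bits and the low byte of each shift result is written
// with an st.param.b8 at offsets 1, 2, 3 and 4, trading one wide store for
// four byte stores that need no alignment.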
1587
1588
// Use byte-load when the param address of the returned value is unaligned.
// This may happen when the returned value is a field of a packed structure.
1590
static SDValue
1591
LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1592
EVT ElementType, SDValue &InGlue,
1593
SmallVectorImpl<SDValue> &TempProxyRegOps,
1594
const SDLoc &dl) {
1595
// Bit logic only works on integer types
1596
EVT MergedType = ElementType;
1597
adjustElementType(MergedType);
1598
1599
// Load each byte and construct the whole value. Initial value to 0
1600
SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1601
// LoadParamMemI8 loads into i16 register only
1602
SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1603
for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1604
SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1605
DAG.getConstant(Offset + i, dl, MVT::i32),
1606
InGlue};
1607
// This will be selected to LoadParamMemI8
1608
SDValue LdVal =
1609
DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1610
MVT::i8, MachinePointerInfo(), Align(1));
1611
SDValue TmpLdVal = LdVal.getValue(0);
1612
Chain = LdVal.getValue(1);
1613
InGlue = LdVal.getValue(2);
1614
1615
TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1616
TmpLdVal.getSimpleValueType(), TmpLdVal);
1617
TempProxyRegOps.push_back(TmpLdVal);
1618
1619
SDValue CMask = DAG.getConstant(255, dl, MergedType);
1620
SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1621
// Need to extend the i16 register to the whole width.
1622
TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1623
// Mask off the high bits, leaving only the lower 8 bits.
// Do this because we are using loadparam.b8.
1625
TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1626
// Shift and merge
1627
TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1628
RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1629
}
1630
if (ElementType != MergedType)
1631
RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1632
1633
return RetVal;
1634
}
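// Sketch of the reassembly above for a hypothetical 32-bit element at
// offset 0 (for illustration): four LoadParam b8 loads produce bytes
// b0..b3; each is zero-extended, masked to 8 bits, shifted into place and
// OR'ed together as b0 | (b1 << 8) | (b2 << 16) | (b3 << 24) before the
// final bitcast back to the original element type.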
1635
1636
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1637
SmallVectorImpl<SDValue> &InVals) const {
1638
1639
if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1640
report_fatal_error(
1641
"Support for variadic functions (unsized array parameter) introduced "
1642
"in PTX ISA version 6.0 and requires target sm_30.");
1643
1644
SelectionDAG &DAG = CLI.DAG;
1645
SDLoc dl = CLI.DL;
1646
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1647
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1648
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1649
SDValue Chain = CLI.Chain;
1650
SDValue Callee = CLI.Callee;
1651
bool &isTailCall = CLI.IsTailCall;
1652
ArgListTy &Args = CLI.getArgs();
1653
Type *RetTy = CLI.RetTy;
1654
const CallBase *CB = CLI.CB;
1655
const DataLayout &DL = DAG.getDataLayout();
1656
1657
bool isABI = (STI.getSmVersion() >= 20);
1658
assert(isABI && "Non-ABI compilation is not supported");
1659
if (!isABI)
1660
return Chain;
1661
1662
// Variadic arguments.
1663
//
1664
// Normally, for each argument, we declare a param scalar or a param
1665
// byte array in the .param space, and store the argument value to that
1666
// param scalar or array starting at offset 0.
1667
//
1668
// In the case of the first variadic argument, we declare a vararg byte array
1669
// with size 0. The exact size of this array isn't known at this point, so
1670
// it'll be patched later. All the variadic arguments will be stored to this
1671
// array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1672
// initially set to 0, so it can be used for non-variadic arguments (which use
1673
// 0 offset) to simplify the code.
1674
//
1675
// After all variadic arguments have been processed, 'VAOffset' holds the
// size of the vararg byte array.
1677
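// For illustration (callee and values made up for this comment): calling
// `void foo(i32, ...)` as foo(1, 2.0, 3) declares the fixed i32 as its own
// param and a single vararg byte array; the double is stored roughly at
// offset 0, the trailing i32 after it, and the resulting VAOffset is
// patched into the array's declared size further below.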
1678
SDValue VADeclareParam; // vararg byte array
1679
unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1680
unsigned VAOffset = 0; // current offset in the param array
1681
1682
unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1683
SDValue TempChain = Chain;
1684
Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1685
SDValue InGlue = Chain.getValue(1);
1686
1687
unsigned ParamCount = 0;
1688
// Args.size() and Outs.size() need not match.
1689
// Outs.size() will be larger
1690
// * if there is an aggregate argument with multiple fields (each field
1691
// showing up separately in Outs)
1692
// * if there is a vector argument with more than typical vector-length
1693
// elements (generally if more than 4) where each vector element is
1694
// individually present in Outs.
1695
// So a different index should be used for indexing into Outs/OutVals.
1696
// See similar issue in LowerFormalArguments.
1697
unsigned OIdx = 0;
1698
// Declare the .param or .reg entries needed to pass values
// to the function.
1700
for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1701
EVT VT = Outs[OIdx].VT;
1702
Type *Ty = Args[i].Ty;
1703
bool IsVAArg = (i >= CLI.NumFixedArgs);
1704
bool IsByVal = Outs[OIdx].Flags.isByVal();
1705
1706
SmallVector<EVT, 16> VTs;
1707
SmallVector<uint64_t, 16> Offsets;
1708
1709
assert((!IsByVal || Args[i].IndirectType) &&
1710
"byval arg must have indirect type");
1711
Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1712
ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1713
1714
Align ArgAlign;
1715
if (IsByVal) {
1716
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1717
// so we don't need to worry whether it's naturally aligned or not.
1718
// See TargetLowering::LowerCallTo().
1719
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1720
ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1721
InitialAlign, DL);
1722
if (IsVAArg)
1723
VAOffset = alignTo(VAOffset, ArgAlign);
1724
} else {
1725
ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1726
}
1727
1728
unsigned TypeSize =
1729
(IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1730
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1731
1732
bool NeedAlign; // Does argument declaration specify alignment?
1733
bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1734
if (IsVAArg) {
1735
if (ParamCount == FirstVAArg) {
1736
SDValue DeclareParamOps[] = {
1737
Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1738
DAG.getConstant(ParamCount, dl, MVT::i32),
1739
DAG.getConstant(1, dl, MVT::i32), InGlue};
1740
VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1741
DeclareParamVTs, DeclareParamOps);
1742
}
1743
NeedAlign = PassAsArray;
1744
} else if (PassAsArray) {
1745
// declare .param .align <align> .b8 .param<n>[<size>];
1746
SDValue DeclareParamOps[] = {
1747
Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1748
DAG.getConstant(ParamCount, dl, MVT::i32),
1749
DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1750
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1751
DeclareParamOps);
1752
NeedAlign = true;
1753
} else {
1754
// declare .param .b<size> .param<n>;
1755
if (VT.isInteger() || VT.isFloatingPoint()) {
1756
// PTX ABI requires integral types to be at least 32 bits in
1757
// size. FP16 is loaded/stored using i16, so it's handled
1758
// here as well.
1759
TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1760
}
1761
SDValue DeclareScalarParamOps[] = {
1762
Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1763
DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1764
DAG.getConstant(0, dl, MVT::i32), InGlue};
1765
Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1766
DeclareScalarParamOps);
1767
NeedAlign = false;
1768
}
1769
InGlue = Chain.getValue(1);
1770
1771
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1772
// than 32-bits are sign extended or zero extended, depending on
1773
// whether they are signed or unsigned types. This case applies
1774
// only to scalar parameters and not to aggregate values.
1775
bool ExtendIntegerParam =
1776
Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1777
1778
auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1779
SmallVector<SDValue, 6> StoreOperands;
1780
for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1781
EVT EltVT = VTs[j];
1782
int CurOffset = Offsets[j];
1783
MaybeAlign PartAlign;
1784
if (NeedAlign)
1785
PartAlign = commonAlignment(ArgAlign, CurOffset);
1786
1787
SDValue StVal = OutVals[OIdx];
1788
1789
MVT PromotedVT;
1790
if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1791
EltVT = EVT(PromotedVT);
1792
}
1793
if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1794
llvm::ISD::NodeType Ext =
1795
Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1796
StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1797
}
1798
1799
if (IsByVal) {
1800
auto PtrVT = getPointerTy(DL);
1801
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1802
DAG.getConstant(CurOffset, dl, PtrVT));
1803
StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1804
PartAlign);
1805
} else if (ExtendIntegerParam) {
1806
assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1807
// zext/sext to i32
1808
StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1809
: ISD::ZERO_EXTEND,
1810
dl, MVT::i32, StVal);
1811
}
1812
1813
if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1814
// Use 16-bit registers for small stores as it's the
1815
// smallest general purpose register size supported by NVPTX.
1816
StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1817
}
1818
1819
// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1820
// scalar store. In such cases, fall back to byte stores.
1821
if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1822
PartAlign.value() <
1823
DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1824
assert(StoreOperands.empty() && "Unfinished preceeding store.");
1825
Chain = LowerUnalignedStoreParam(
1826
DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1827
StVal, InGlue, ParamCount, dl);
1828
1829
// LowerUnalignedStoreParam took care of inserting the necessary nodes
1830
// into the SDAG, so just move on to the next element.
1831
if (!IsByVal)
1832
++OIdx;
1833
continue;
1834
}
1835
1836
// New store.
1837
if (VectorInfo[j] & PVF_FIRST) {
1838
assert(StoreOperands.empty() && "Unfinished preceding store.");
1839
StoreOperands.push_back(Chain);
1840
StoreOperands.push_back(
1841
DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1842
1843
StoreOperands.push_back(DAG.getConstant(
1844
IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1845
dl, MVT::i32));
1846
}
1847
1848
// Record the value to store.
1849
StoreOperands.push_back(StVal);
1850
1851
if (VectorInfo[j] & PVF_LAST) {
1852
unsigned NumElts = StoreOperands.size() - 3;
1853
NVPTXISD::NodeType Op;
1854
switch (NumElts) {
1855
case 1:
1856
Op = NVPTXISD::StoreParam;
1857
break;
1858
case 2:
1859
Op = NVPTXISD::StoreParamV2;
1860
break;
1861
case 4:
1862
Op = NVPTXISD::StoreParamV4;
1863
break;
1864
default:
1865
llvm_unreachable("Invalid vector info.");
1866
}
1867
1868
StoreOperands.push_back(InGlue);
1869
1870
// Adjust type of the store op if we've extended the scalar
1871
// return value.
1872
EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1873
1874
Chain = DAG.getMemIntrinsicNode(
1875
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1876
TheStoreType, MachinePointerInfo(), PartAlign,
1877
MachineMemOperand::MOStore);
1878
InGlue = Chain.getValue(1);
1879
1880
// Cleanup.
1881
StoreOperands.clear();
1882
1883
// TODO: We may need to support vector types that can be passed
1884
// as scalars in variadic arguments.
1885
if (!IsByVal && IsVAArg) {
1886
assert(NumElts == 1 &&
1887
"Vectorization is expected to be disabled for variadics.");
1888
VAOffset += DL.getTypeAllocSize(
1889
TheStoreType.getTypeForEVT(*DAG.getContext()));
1890
}
1891
}
1892
if (!IsByVal)
1893
++OIdx;
1894
}
1895
assert(StoreOperands.empty() && "Unfinished parameter store.");
1896
if (!IsByVal && VTs.size() > 0)
1897
--OIdx;
1898
++ParamCount;
1899
if (IsByVal && IsVAArg)
1900
VAOffset += TypeSize;
1901
}
1902
1903
GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1904
MaybeAlign retAlignment = std::nullopt;
1905
1906
// Handle Result
1907
if (Ins.size() > 0) {
1908
SmallVector<EVT, 16> resvtparts;
1909
ComputeValueVTs(*this, DL, RetTy, resvtparts);
1910
1911
// Declare
1912
// .param .align N .b8 retval0[<size-in-bytes>], or
1913
// .param .b<size-in-bits> retval0
1914
unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1915
if (!IsTypePassedAsArray(RetTy)) {
1916
resultsz = promoteScalarArgumentSize(resultsz);
1917
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1918
SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1919
DAG.getConstant(resultsz, dl, MVT::i32),
1920
DAG.getConstant(0, dl, MVT::i32), InGlue };
1921
Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1922
DeclareRetOps);
1923
InGlue = Chain.getValue(1);
1924
} else {
1925
retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1926
assert(retAlignment && "retAlignment is guaranteed to be set");
1927
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1928
SDValue DeclareRetOps[] = {
1929
Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1930
DAG.getConstant(resultsz / 8, dl, MVT::i32),
1931
DAG.getConstant(0, dl, MVT::i32), InGlue};
1932
Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1933
DeclareRetOps);
1934
InGlue = Chain.getValue(1);
1935
}
1936
}
1937
1938
bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1939
// Set the size of the vararg param byte array if the callee is a variadic
1940
// function and the variadic part is not empty.
1941
if (HasVAArgs) {
1942
SDValue DeclareParamOps[] = {
1943
VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1944
VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1945
VADeclareParam.getOperand(4)};
1946
DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1947
VADeclareParam->getVTList(), DeclareParamOps);
1948
}
1949
1950
// Both indirect calls and libcalls have nullptr Func. In order to distinguish
1951
// between them we must rely on the call site value which is valid for
1952
// indirect calls but is always null for libcalls.
1953
bool isIndirectCall = !Func && CB;
1954
1955
if (isa<ExternalSymbolSDNode>(Callee)) {
1956
Function* CalleeFunc = nullptr;
1957
1958
// Try to find the callee in the current module.
1959
Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1960
assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1961
1962
// Set the "libcall callee" attribute to indicate that the function
1963
// must always have a declaration.
1964
CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1965
}
1966
1967
if (isIndirectCall) {
1968
// This is indirect function call case : PTX requires a prototype of the
1969
// form
1970
// proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1971
// to be emitted, and the label has to be used as the last arg of the call
1972
// instruction.
1973
// The prototype is embedded in a string and put as the operand for a
1974
// CallPrototype SDNode which will print out to the value of the string.
1975
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1976
std::string Proto = getPrototype(
1977
DL, RetTy, Args, Outs, retAlignment,
1978
HasVAArgs
1979
? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1980
CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1981
: std::nullopt,
1982
*CB, UniqueCallSite);
1983
const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1984
SDValue ProtoOps[] = {
1985
Chain,
1986
DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1987
InGlue,
1988
};
1989
Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1990
InGlue = Chain.getValue(1);
1991
}
1992
// Op to just print "call"
1993
SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1994
SDValue PrintCallOps[] = {
1995
Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1996
};
1997
// We model convergent calls as separate opcodes.
1998
unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1999
if (CLI.IsConvergent)
2000
Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
2001
: NVPTXISD::PrintConvergentCall;
2002
Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2003
InGlue = Chain.getValue(1);
2004
2005
// Ops to print out the function name
2006
SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2007
SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2008
Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2009
InGlue = Chain.getValue(1);
2010
2011
// Ops to print out the param list
2012
SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2013
SDValue CallArgBeginOps[] = { Chain, InGlue };
2014
Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2015
CallArgBeginOps);
2016
InGlue = Chain.getValue(1);
2017
2018
for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2019
++i) {
2020
unsigned opcode;
2021
if (i == (e - 1))
2022
opcode = NVPTXISD::LastCallArg;
2023
else
2024
opcode = NVPTXISD::CallArg;
2025
SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2026
SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2027
DAG.getConstant(i, dl, MVT::i32), InGlue };
2028
Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2029
InGlue = Chain.getValue(1);
2030
}
2031
SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2032
SDValue CallArgEndOps[] = { Chain,
2033
DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2034
InGlue };
2035
Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2036
InGlue = Chain.getValue(1);
2037
2038
if (isIndirectCall) {
2039
SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2040
SDValue PrototypeOps[] = {
2041
Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2042
Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2043
InGlue = Chain.getValue(1);
2044
}
2045
2046
SmallVector<SDValue, 16> ProxyRegOps;
2047
SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2048
// An item of the vector is filled if the element does not need a ProxyReg
2049
// operation on it and should be added to InVals as is. ProxyRegOps and
2050
// ProxyRegTruncates contain empty/none items at the same index.
2051
SmallVector<SDValue, 16> RetElts;
2052
// Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
// to use the values of `LoadParam`s; they are replaced later, once
// `CALLSEQ_END` has been added.
2055
SmallVector<SDValue, 16> TempProxyRegOps;
2056
2057
// Generate loads from param memory/moves from registers for result
2058
if (Ins.size() > 0) {
2059
SmallVector<EVT, 16> VTs;
2060
SmallVector<uint64_t, 16> Offsets;
2061
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2062
assert(VTs.size() == Ins.size() && "Bad value decomposition");
2063
2064
Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2065
auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2066
2067
SmallVector<EVT, 6> LoadVTs;
2068
int VecIdx = -1; // Index of the first element of the vector.
2069
2070
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2071
// 32-bits are sign extended or zero extended, depending on whether
2072
// they are signed or unsigned types.
2073
bool ExtendIntegerRetVal =
2074
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2075
2076
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2077
bool needTruncate = false;
2078
EVT TheLoadType = VTs[i];
2079
EVT EltType = Ins[i].VT;
2080
Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2081
MVT PromotedVT;
2082
2083
if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2084
TheLoadType = EVT(PromotedVT);
2085
EltType = EVT(PromotedVT);
2086
needTruncate = true;
2087
}
2088
2089
if (ExtendIntegerRetVal) {
2090
TheLoadType = MVT::i32;
2091
EltType = MVT::i32;
2092
needTruncate = true;
2093
} else if (TheLoadType.getSizeInBits() < 16) {
2094
if (VTs[i].isInteger())
2095
needTruncate = true;
2096
EltType = MVT::i16;
2097
}
2098
2099
// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2100
// scalar load. In such cases, fall back to byte loads.
2101
if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
2102
EltAlign < DL.getABITypeAlign(
2103
TheLoadType.getTypeForEVT(*DAG.getContext()))) {
2104
assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2105
SDValue Ret = LowerUnalignedLoadRetParam(
2106
DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
2107
ProxyRegOps.push_back(SDValue());
2108
ProxyRegTruncates.push_back(std::optional<MVT>());
2109
RetElts.resize(i);
2110
RetElts.push_back(Ret);
2111
2112
continue;
2113
}
2114
2115
// Record index of the very first element of the vector.
2116
if (VectorInfo[i] & PVF_FIRST) {
2117
assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2118
VecIdx = i;
2119
}
2120
2121
LoadVTs.push_back(EltType);
2122
2123
if (VectorInfo[i] & PVF_LAST) {
2124
unsigned NumElts = LoadVTs.size();
2125
LoadVTs.push_back(MVT::Other);
2126
LoadVTs.push_back(MVT::Glue);
2127
NVPTXISD::NodeType Op;
2128
switch (NumElts) {
2129
case 1:
2130
Op = NVPTXISD::LoadParam;
2131
break;
2132
case 2:
2133
Op = NVPTXISD::LoadParamV2;
2134
break;
2135
case 4:
2136
Op = NVPTXISD::LoadParamV4;
2137
break;
2138
default:
2139
llvm_unreachable("Invalid vector info.");
2140
}
2141
2142
SDValue LoadOperands[] = {
2143
Chain, DAG.getConstant(1, dl, MVT::i32),
2144
DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2145
SDValue RetVal = DAG.getMemIntrinsicNode(
2146
Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2147
MachinePointerInfo(), EltAlign,
2148
MachineMemOperand::MOLoad);
2149
2150
for (unsigned j = 0; j < NumElts; ++j) {
2151
ProxyRegOps.push_back(RetVal.getValue(j));
2152
2153
if (needTruncate)
2154
ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2155
else
2156
ProxyRegTruncates.push_back(std::optional<MVT>());
2157
}
2158
2159
Chain = RetVal.getValue(NumElts);
2160
InGlue = RetVal.getValue(NumElts + 1);
2161
2162
// Cleanup
2163
VecIdx = -1;
2164
LoadVTs.clear();
2165
}
2166
}
2167
}
2168
2169
Chain =
2170
DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2171
InGlue = Chain.getValue(1);
2172
2173
// Append ProxyReg instructions to the chain to make sure that `callseq_end`
2174
// will not get lost. Otherwise, during libcalls expansion, the nodes can become
2175
// dangling.
2176
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2177
if (i < RetElts.size() && RetElts[i]) {
2178
InVals.push_back(RetElts[i]);
2179
continue;
2180
}
2181
2182
SDValue Ret = DAG.getNode(
2183
NVPTXISD::ProxyReg, dl,
2184
DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2185
{ Chain, ProxyRegOps[i], InGlue }
2186
);
2187
2188
Chain = Ret.getValue(1);
2189
InGlue = Ret.getValue(2);
2190
2191
if (ProxyRegTruncates[i]) {
2192
Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2193
}
2194
2195
InVals.push_back(Ret);
2196
}
2197
2198
for (SDValue &T : TempProxyRegOps) {
2199
SDValue Repl = DAG.getNode(
2200
NVPTXISD::ProxyReg, dl,
2201
DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
2202
{Chain, T.getOperand(0), InGlue});
2203
DAG.ReplaceAllUsesWith(T, Repl);
2204
DAG.RemoveDeadNode(T.getNode());
2205
2206
Chain = Repl.getValue(1);
2207
InGlue = Repl.getValue(2);
2208
}
2209
2210
// set isTailCall to false for now, until we figure out how to express
2211
// tail call optimization in PTX
2212
isTailCall = false;
2213
return Chain;
2214
}
2215
2216
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2217
SelectionDAG &DAG) const {
2218
2219
if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2220
const Function &Fn = DAG.getMachineFunction().getFunction();
2221
2222
DiagnosticInfoUnsupported NoDynamicAlloca(
2223
Fn,
2224
"Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2225
"requires target sm_52.",
2226
SDLoc(Op).getDebugLoc());
2227
DAG.getContext()->diagnose(NoDynamicAlloca);
2228
auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2229
Op.getOperand(0)};
2230
return DAG.getMergeValues(Ops, SDLoc());
2231
}
2232
2233
SDValue Chain = Op.getOperand(0);
2234
SDValue Size = Op.getOperand(1);
2235
uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2236
SDLoc DL(Op.getNode());
2237
2238
// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2239
if (nvTM->is64Bit())
2240
Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2241
else
2242
Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2243
2244
SDValue AllocOps[] = {Chain, Size,
2245
DAG.getTargetConstant(Align, DL, MVT::i32)};
2246
SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
2247
nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2248
2249
SDValue MergeOps[] = {Alloca, Chain};
2250
return DAG.getMergeValues(MergeOps, DL);
2251
}
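// For example (illustrative): on an sm_52+/PTX 7.3+ target an `alloca(n)`
// with a 32-bit `n` is zero-extended to 64 bits under -m64, since the PTX
// alloca size operand matches the address size, and then emitted through
// the DYNAMIC_STACKALLOC node above; on older targets the diagnostic fires
// and a null pointer is returned instead.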
2252
2253
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2254
// (see LegalizeDAG.cpp). This is slow and uses local memory.
2255
// We use extract/insert/build vector nodes, just as LegalizeOp() did in LLVM 2.5.
2256
SDValue
2257
NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2258
SDNode *Node = Op.getNode();
2259
SDLoc dl(Node);
2260
SmallVector<SDValue, 8> Ops;
2261
unsigned NumOperands = Node->getNumOperands();
2262
for (unsigned i = 0; i < NumOperands; ++i) {
2263
SDValue SubOp = Node->getOperand(i);
2264
EVT VVT = SubOp.getNode()->getValueType(0);
2265
EVT EltVT = VVT.getVectorElementType();
2266
unsigned NumSubElem = VVT.getVectorNumElements();
2267
for (unsigned j = 0; j < NumSubElem; ++j) {
2268
Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2269
DAG.getIntPtrConstant(j, dl)));
2270
}
2271
}
2272
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2273
}
2274
2275
// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2276
// would get lowered as two constant loads and a vector-packing move.
2277
// Instead we want just a constant move:
2278
// mov.b32 %r2, 0x40003C00
2279
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2280
SelectionDAG &DAG) const {
2281
EVT VT = Op->getValueType(0);
2282
if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2283
return Op;
2284
2285
SDLoc DL(Op);
2286
2287
if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2288
return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2289
isa<ConstantFPSDNode>(Operand);
2290
})) {
2291
// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2292
// to optimize calculation of constant parts.
2293
if (VT == MVT::v4i8) {
2294
SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2295
SDValue E01 = DAG.getNode(
2296
NVPTXISD::BFI, DL, MVT::i32,
2297
DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2298
DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2299
SDValue E012 =
2300
DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2301
DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2302
E01, DAG.getConstant(16, DL, MVT::i32), C8);
2303
SDValue E0123 =
2304
DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2305
DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2306
E012, DAG.getConstant(24, DL, MVT::i32), C8);
2307
return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2308
}
2309
return Op;
2310
}
2311
2312
// Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2313
auto GetOperand = [](SDValue Op, int N) -> APInt {
2314
const SDValue &Operand = Op->getOperand(N);
2315
EVT VT = Op->getValueType(0);
2316
if (Operand->isUndef())
2317
return APInt(32, 0);
2318
APInt Value;
2319
if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2320
Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2321
else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2322
Value = Operand->getAsAPIntVal();
2323
else
2324
llvm_unreachable("Unsupported type");
2325
// i8 values are carried around as i16, so we need to zero out upper bits,
2326
// so they do not get in the way of combining individual byte values
2327
if (VT == MVT::v4i8)
2328
Value = Value.trunc(8);
2329
return Value.zext(32);
2330
};
2331
APInt Value;
2332
if (Isv2x16VT(VT)) {
2333
Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2334
} else if (VT == MVT::v4i8) {
2335
Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2336
GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2337
} else {
2338
llvm_unreachable("Unsupported type");
2339
}
2340
SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2341
return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2342
}
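// Worked example of the packing above: BUILD_VECTOR <half 1.0, half 2.0>
// has element bit patterns 0x3C00 and 0x4000, so the packed constant is
// 0x3C00 | (0x4000 << 16) = 0x40003C00, which matches the single mov.b32
// shown in the comment before this function.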
2343
2344
SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2345
SelectionDAG &DAG) const {
2346
SDValue Index = Op->getOperand(1);
2347
SDValue Vector = Op->getOperand(0);
2348
SDLoc DL(Op);
2349
EVT VectorVT = Vector.getValueType();
2350
2351
if (VectorVT == MVT::v4i8) {
2352
SDValue BFE =
2353
DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2354
{Vector,
2355
DAG.getNode(ISD::MUL, DL, MVT::i32,
2356
DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2357
DAG.getConstant(8, DL, MVT::i32)),
2358
DAG.getConstant(8, DL, MVT::i32)});
2359
return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2360
}
2361
2362
// Constant index will be matched by tablegen.
2363
if (isa<ConstantSDNode>(Index.getNode()))
2364
return Op;
2365
2366
// Extract individual elements and select one of them.
2367
assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2368
EVT EltVT = VectorVT.getVectorElementType();
2369
2370
SDLoc dl(Op.getNode());
2371
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2372
DAG.getIntPtrConstant(0, dl));
2373
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2374
DAG.getIntPtrConstant(1, dl));
2375
return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2376
ISD::CondCode::SETEQ);
2377
}
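// For example (illustrative): extracting element 2 of a v4i8 becomes a BFE
// with start = 2 * 8 and length 8, i.e. roughly `bfe.u32 %r, %r_vec, 16, 8;`
// in the final PTX.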
2378
2379
SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2380
SelectionDAG &DAG) const {
2381
SDValue Vector = Op->getOperand(0);
2382
EVT VectorVT = Vector.getValueType();
2383
2384
if (VectorVT != MVT::v4i8)
2385
return Op;
2386
SDLoc DL(Op);
2387
SDValue Value = Op->getOperand(1);
2388
if (Value->isUndef())
2389
return Vector;
2390
2391
SDValue Index = Op->getOperand(2);
2392
2393
SDValue BFI =
2394
DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2395
{DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2396
DAG.getNode(ISD::MUL, DL, MVT::i32,
2397
DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2398
DAG.getConstant(8, DL, MVT::i32)),
2399
DAG.getConstant(8, DL, MVT::i32)});
2400
return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2401
}
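// For example (illustrative): inserting a byte at index 1 of a v4i8 becomes
// a BFI with position 1 * 8 and length 8, i.e. roughly
// `bfi.b32 %r, %r_val, %r_vec, 8, 8;` in the final PTX.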
2402
2403
SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2404
SelectionDAG &DAG) const {
2405
SDValue V1 = Op.getOperand(0);
2406
EVT VectorVT = V1.getValueType();
2407
if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2408
return Op;
2409
2410
// Lower shuffle to PRMT instruction.
2411
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2412
SDValue V2 = Op.getOperand(1);
2413
uint32_t Selector = 0;
2414
for (auto I : llvm::enumerate(SVN->getMask())) {
2415
if (I.value() != -1) // -1 is a placeholder for undef.
2416
Selector |= (I.value() << (I.index() * 4));
2417
}
2418
2419
SDLoc DL(Op);
2420
return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2421
DAG.getConstant(Selector, DL, MVT::i32),
2422
DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2423
}
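// Worked example of the selector above (mask chosen for illustration): a
// shuffle of two v4i8 values with mask <0, 4, 1, 5> sets nibble i to
// mask[i], giving 0x5140, so the whole shuffle becomes roughly
// `prmt.b32 %r, %r_v1, %r_v2, 0x5140;`.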
2424
/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
///    amount.
2429
SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2430
SelectionDAG &DAG) const {
2431
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2432
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2433
2434
EVT VT = Op.getValueType();
2435
unsigned VTBits = VT.getSizeInBits();
2436
SDLoc dl(Op);
2437
SDValue ShOpLo = Op.getOperand(0);
2438
SDValue ShOpHi = Op.getOperand(1);
2439
SDValue ShAmt = Op.getOperand(2);
2440
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2441
2442
if (VTBits == 32 && STI.getSmVersion() >= 35) {
2443
// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2444
// {dHi, dLo} = {aHi, aLo} >> Amt
2445
// dHi = aHi >> Amt
2446
// dLo = shf.r.clamp aLo, aHi, Amt
2447
2448
SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2449
SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2450
ShAmt);
2451
2452
SDValue Ops[2] = { Lo, Hi };
2453
return DAG.getMergeValues(Ops, dl);
2454
}
2455
else {
2456
// {dHi, dLo} = {aHi, aLo} >> Amt
2457
// - if (Amt>=size) then
2458
// dLo = aHi >> (Amt-size)
2459
// dHi = aHi >> Amt (this is either all 0 or all 1)
2460
// else
2461
// dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2462
// dHi = aHi >> Amt
2463
2464
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2465
DAG.getConstant(VTBits, dl, MVT::i32),
2466
ShAmt);
2467
SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2468
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2469
DAG.getConstant(VTBits, dl, MVT::i32));
2470
SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2471
SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2472
SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2473
2474
SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2475
DAG.getConstant(VTBits, dl, MVT::i32),
2476
ISD::SETGE);
2477
SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2478
SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2479
2480
SDValue Ops[2] = { Lo, Hi };
2481
return DAG.getMergeValues(Ops, dl);
2482
}
2483
}
2484
2485
/// LowerShiftLeftParts - Lower SHL_PARTS, which
/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
///    amount, or
/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
///    amount.
2490
SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2491
SelectionDAG &DAG) const {
2492
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2493
assert(Op.getOpcode() == ISD::SHL_PARTS);
2494
2495
EVT VT = Op.getValueType();
2496
unsigned VTBits = VT.getSizeInBits();
2497
SDLoc dl(Op);
2498
SDValue ShOpLo = Op.getOperand(0);
2499
SDValue ShOpHi = Op.getOperand(1);
2500
SDValue ShAmt = Op.getOperand(2);
2501
2502
if (VTBits == 32 && STI.getSmVersion() >= 35) {
2503
// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2504
// {dHi, dLo} = {aHi, aLo} << Amt
2505
// dHi = shf.l.clamp aLo, aHi, Amt
2506
// dLo = aLo << Amt
2507
2508
SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2509
ShAmt);
2510
SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2511
2512
SDValue Ops[2] = { Lo, Hi };
2513
return DAG.getMergeValues(Ops, dl);
2514
}
2515
else {
2516
// {dHi, dLo} = {aHi, aLo} << Amt
2517
// - if (Amt>=size) then
2518
// dLo = aLo << Amt (all 0)
// dHi = aLo << (Amt-size)
2520
// else
2521
// dLo = aLo << Amt
2522
// dHi = (aHi << Amt) | (aLo >> (size-Amt))
2523
2524
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2525
DAG.getConstant(VTBits, dl, MVT::i32),
2526
ShAmt);
2527
SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2528
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2529
DAG.getConstant(VTBits, dl, MVT::i32));
2530
SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2531
SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2532
SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2533
2534
SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2535
DAG.getConstant(VTBits, dl, MVT::i32),
2536
ISD::SETGE);
2537
SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2538
SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2539
2540
SDValue Ops[2] = { Lo, Hi };
2541
return DAG.getMergeValues(Ops, dl);
2542
}
2543
}
2544
2545
SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2546
EVT VT = Op.getValueType();
2547
2548
if (VT == MVT::f32)
2549
return LowerFROUND32(Op, DAG);
2550
2551
if (VT == MVT::f64)
2552
return LowerFROUND64(Op, DAG);
2553
2554
llvm_unreachable("unhandled type");
2555
}
2556
2557
// This is the rounding method used in CUDA libdevice, in C-like code:
2558
// float roundf(float A)
2559
// {
2560
// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2561
// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2562
// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2563
// }
2564
SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2565
SelectionDAG &DAG) const {
2566
SDLoc SL(Op);
2567
SDValue A = Op.getOperand(0);
2568
EVT VT = Op.getValueType();
2569
2570
SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2571
2572
// RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2573
SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2574
const int SignBitMask = 0x80000000;
2575
SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2576
DAG.getConstant(SignBitMask, SL, MVT::i32));
2577
const int PointFiveInBits = 0x3F000000;
2578
SDValue PointFiveWithSignRaw =
2579
DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2580
DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2581
SDValue PointFiveWithSign =
2582
DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2583
SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2584
SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2585
2586
// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2587
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2588
SDValue IsLarge =
2589
DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2590
ISD::SETOGT);
2591
RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2592
2593
// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2594
SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2595
DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2596
SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2597
return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2598
}
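// Worked example of the sequence above (input chosen for illustration):
// for A = -2.5f the sign bit OR'ed into 0.5f gives -0.5f, A + (-0.5f) is
// -3.0f and FTRUNC yields -3.0f; |A| is neither greater than 2^23 nor less
// than 0.5, so -3.0f is returned, i.e. halfway cases round away from zero.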
2599
2600
// The implementation of round(double) is similar to that of round(float) in
2601
// that they both separate the value range into three regions and use a method
2602
// specific to the region to round the values. However, round(double) first
2603
// calculates the round of the absolute value and then adds the sign back while
2604
// round(float) directly rounds the value with sign.
2605
SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2606
SelectionDAG &DAG) const {
2607
SDLoc SL(Op);
2608
SDValue A = Op.getOperand(0);
2609
EVT VT = Op.getValueType();
2610
2611
SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2612
2613
// double RoundedA = (double) (int) (abs(A) + 0.5f);
2614
SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2615
DAG.getConstantFP(0.5, SL, VT));
2616
SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2617
2618
// RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2619
EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2620
SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2621
DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2622
RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2623
DAG.getConstantFP(0, SL, VT),
2624
RoundedA);
2625
2626
// Add sign to rounded_A
2627
RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2628
DAG.getNode(ISD::FTRUNC, SL, VT, A);
2629
2630
// RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2631
SDValue IsLarge =
2632
DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2633
ISD::SETOGT);
2634
return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2635
}
2636
2637
SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2638
SelectionDAG &DAG) const {
2639
assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2640
2641
if (Op.getValueType() == MVT::bf16) {
2642
SDLoc Loc(Op);
2643
return DAG.getNode(
2644
ISD::FP_ROUND, Loc, MVT::bf16,
2645
DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2646
DAG.getIntPtrConstant(0, Loc));
2647
}
2648
2649
// Everything else is considered legal.
2650
return Op;
2651
}
2652
2653
SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2654
SelectionDAG &DAG) const {
2655
assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2656
2657
if (Op.getOperand(0).getValueType() == MVT::bf16) {
2658
SDLoc Loc(Op);
2659
return DAG.getNode(
2660
Op.getOpcode(), Loc, Op.getValueType(),
2661
DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2662
}
2663
2664
// Everything else is considered legal.
2665
return Op;
2666
}
2667
2668
SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2669
SelectionDAG &DAG) const {
2670
EVT NarrowVT = Op.getValueType();
2671
SDValue Wide = Op.getOperand(0);
2672
EVT WideVT = Wide.getValueType();
2673
if (NarrowVT.getScalarType() == MVT::bf16) {
2674
const TargetLowering *TLI = STI.getTargetLowering();
2675
if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2676
return TLI->expandFP_ROUND(Op.getNode(), DAG);
2677
}
2678
if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2679
// This combination was the first to support f32 -> bf16.
2680
if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2681
if (WideVT.getScalarType() == MVT::f32) {
2682
return Op;
2683
}
2684
if (WideVT.getScalarType() == MVT::f64) {
2685
SDLoc Loc(Op);
2686
// Round-inexact-to-odd f64 to f32, then do the final rounding using
2687
// the hardware f32 -> bf16 instruction.
2688
SDValue rod = TLI->expandRoundInexactToOdd(
2689
WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2690
: MVT::f32,
2691
Wide, Loc, DAG);
2692
return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2693
}
2694
}
2695
return TLI->expandFP_ROUND(Op.getNode(), DAG);
2696
}
2697
}
2698
2699
// Everything else is considered legal.
2700
return Op;
2701
}
2702
2703
SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2704
SelectionDAG &DAG) const {
2705
SDValue Narrow = Op.getOperand(0);
2706
EVT NarrowVT = Narrow.getValueType();
2707
EVT WideVT = Op.getValueType();
2708
if (NarrowVT.getScalarType() == MVT::bf16) {
2709
if (WideVT.getScalarType() == MVT::f32 &&
2710
(STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2711
SDLoc Loc(Op);
2712
return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2713
}
2714
if (WideVT.getScalarType() == MVT::f64 &&
2715
(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2716
EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2717
: MVT::f32;
2718
SDLoc Loc(Op);
2719
if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2720
Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2721
} else {
2722
Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2723
}
2724
return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2725
}
2726
}
2727
2728
// Everything else is considered legal.
2729
return Op;
2730
}
2731
2732
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2733
SDLoc DL(Op);
2734
if (Op.getValueType() != MVT::v2i16)
2735
return Op;
2736
EVT EltVT = Op.getValueType().getVectorElementType();
2737
SmallVector<SDValue> VecElements;
2738
for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2739
SmallVector<SDValue> ScalarArgs;
2740
llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2741
[&](const SDUse &O) {
2742
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2743
O.get(), DAG.getIntPtrConstant(I, DL));
2744
});
2745
VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2746
}
2747
SDValue V =
2748
DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2749
return V;
2750
}
2751
2752
SDValue
2753
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2754
switch (Op.getOpcode()) {
2755
case ISD::RETURNADDR:
2756
return SDValue();
2757
case ISD::FRAMEADDR:
2758
return SDValue();
2759
case ISD::GlobalAddress:
2760
return LowerGlobalAddress(Op, DAG);
2761
case ISD::INTRINSIC_W_CHAIN:
2762
return Op;
2763
case ISD::BUILD_VECTOR:
2764
return LowerBUILD_VECTOR(Op, DAG);
2765
case ISD::EXTRACT_SUBVECTOR:
2766
return Op;
2767
case ISD::EXTRACT_VECTOR_ELT:
2768
return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2769
case ISD::INSERT_VECTOR_ELT:
2770
return LowerINSERT_VECTOR_ELT(Op, DAG);
2771
case ISD::VECTOR_SHUFFLE:
2772
return LowerVECTOR_SHUFFLE(Op, DAG);
2773
case ISD::CONCAT_VECTORS:
2774
return LowerCONCAT_VECTORS(Op, DAG);
2775
case ISD::STORE:
2776
return LowerSTORE(Op, DAG);
2777
case ISD::LOAD:
2778
return LowerLOAD(Op, DAG);
2779
case ISD::SHL_PARTS:
2780
return LowerShiftLeftParts(Op, DAG);
2781
case ISD::SRA_PARTS:
2782
case ISD::SRL_PARTS:
2783
return LowerShiftRightParts(Op, DAG);
2784
case ISD::SELECT:
2785
return LowerSelect(Op, DAG);
2786
case ISD::FROUND:
2787
return LowerFROUND(Op, DAG);
2788
case ISD::SINT_TO_FP:
2789
case ISD::UINT_TO_FP:
2790
return LowerINT_TO_FP(Op, DAG);
2791
case ISD::FP_TO_SINT:
2792
case ISD::FP_TO_UINT:
2793
return LowerFP_TO_INT(Op, DAG);
2794
case ISD::FP_ROUND:
2795
return LowerFP_ROUND(Op, DAG);
2796
case ISD::FP_EXTEND:
2797
return LowerFP_EXTEND(Op, DAG);
2798
case ISD::VAARG:
2799
return LowerVAARG(Op, DAG);
2800
case ISD::VASTART:
2801
return LowerVASTART(Op, DAG);
2802
case ISD::ABS:
2803
case ISD::SMIN:
2804
case ISD::SMAX:
2805
case ISD::UMIN:
2806
case ISD::UMAX:
2807
case ISD::ADD:
2808
case ISD::SUB:
2809
case ISD::MUL:
2810
case ISD::SHL:
2811
case ISD::SREM:
2812
case ISD::UREM:
2813
return LowerVectorArith(Op, DAG);
2814
case ISD::DYNAMIC_STACKALLOC:
2815
return LowerDYNAMIC_STACKALLOC(Op, DAG);
2816
case ISD::CopyToReg:
2817
return LowerCopyToReg_128(Op, DAG);
2818
default:
2819
llvm_unreachable("Custom lowering not defined for operation");
2820
}
2821
}
2822
2823
// This function is almost a copy of SelectionDAG::expandVAArg().
2824
// The only diff is that this one produces loads from local address space.
2825
SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2826
const TargetLowering *TLI = STI.getTargetLowering();
2827
SDLoc DL(Op);
2828
2829
SDNode *Node = Op.getNode();
2830
const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2831
EVT VT = Node->getValueType(0);
2832
auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2833
SDValue Tmp1 = Node->getOperand(0);
2834
SDValue Tmp2 = Node->getOperand(1);
2835
const MaybeAlign MA(Node->getConstantOperandVal(3));
2836
2837
SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2838
Tmp1, Tmp2, MachinePointerInfo(V));
2839
SDValue VAList = VAListLoad;
2840
2841
if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2842
VAList = DAG.getNode(
2843
ISD::ADD, DL, VAList.getValueType(), VAList,
2844
DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2845
2846
VAList = DAG.getNode(
2847
ISD::AND, DL, VAList.getValueType(), VAList,
2848
DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2849
}
2850
2851
// Increment the pointer, VAList, to the next vaarg
2852
Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2853
DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2854
DL, VAList.getValueType()));
2855
2856
// Store the incremented VAList to the legalized pointer
2857
Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2858
MachinePointerInfo(V));
2859
2860
const Value *SrcV =
2861
Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL));
2862
2863
// Load the actual argument out of the pointer VAList
2864
return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2865
}
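// For example (types chosen for illustration): va_arg(ap, double) with an
// 8-byte alignment operand first rounds the loaded va_list pointer up via
// (VAList + 7) & ~7 (only when 8 exceeds the minimum stack argument
// alignment), stores VAList + 8 back into the ap object, and loads the
// double itself from the local address space.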
2866
2867
SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2868
const TargetLowering *TLI = STI.getTargetLowering();
2869
SDLoc DL(Op);
2870
EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2871
2872
// Store the address of unsized array <function>_vararg[] in the ap object.
2873
SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2874
SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2875
2876
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2877
return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2878
MachinePointerInfo(SV));
2879
}
2880
2881
SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2882
SDValue Op0 = Op->getOperand(0);
2883
SDValue Op1 = Op->getOperand(1);
2884
SDValue Op2 = Op->getOperand(2);
2885
SDLoc DL(Op.getNode());
2886
2887
assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2888
2889
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2890
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2891
SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2892
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2893
2894
return Trunc;
2895
}
2896
2897
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1)
    return LowerLOADi1(Op, DAG);

  // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
  // handle unaligned loads and have to handle them here.
  EVT VT = Op.getValueType();
  if (Isv2x16VT(VT) || VT == MVT::v4i8) {
    LoadSDNode *Load = cast<LoadSDNode>(Op);
    EVT MemVT = Load->getMemoryVT();
    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        MemVT, *Load->getMemOperand())) {
      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
      return DAG.getMergeValues(Ops, SDLoc(Op));
    }
  }

  return SDValue();
}
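
// For illustration: when allowsMemoryAccessForAlignment rejects the access,
// expandUnalignedLoad breaks the wide load into narrower loads that are
// recombined. A minimal conceptual sketch of that recombination on plain
// bytes (little-endian reassembly assumed; not the actual legalizer code):
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cstdint>

// Reassemble an unaligned 32-bit value (e.g. a v4i8) from byte loads that
// are shifted into place and OR'ed together.
static uint32_t loadUnaligned32(const uint8_t *Addr) {
  uint32_t V = 0;
  for (unsigned i = 0; i < 4; ++i)
    V |= (uint32_t)Addr[i] << (8 * i);
  return V;
}
#endif
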
// v = ld i1* addr
//   =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  LoadSDNode *LD = cast<LoadSDNode>(Node);
  SDLoc dl(Node);
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
  assert(Node->getValueType(0) == MVT::i1 &&
         "Custom lowering for i1 load only");
  SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
                                 LD->getBasePtr(), LD->getPointerInfo(),
                                 MVT::i8, LD->getAlign(),
                                 LD->getMemOperand()->getFlags());
  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
  // The legalizer (the caller) is expecting two values from the legalized
  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
  // in LegalizeDAG.cpp which also uses MergeValues.
  SDValue Ops[] = { result, LD->getChain() };
  return DAG.getMergeValues(Ops, dl);
}
SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1)
    return LowerSTOREi1(Op, DAG);

  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
  // stores and have to handle them here.
  if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);

  // v2f16, v2bf16 and v2i16 don't need special handling.
  if (Isv2x16VT(VT) || VT == MVT::v4i8)
    return SDValue();

  if (VT.isVector())
    return LowerSTOREVector(Op, DAG);

  return SDValue();
}
SDValue
NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
  SDNode *N = Op.getNode();
  SDValue Val = N->getOperand(1);
  SDLoc DL(N);
  EVT ValVT = Val.getValueType();

  if (ValVT.isVector()) {
    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
    // legal. We can (and should) split that into 2 stores of <2 x double> here
    // but I'm leaving that as a TODO for now.
    if (!ValVT.isSimple())
      return SDValue();
    switch (ValVT.getSimpleVT().SimpleTy) {
    default:
      return SDValue();
    case MVT::v2i8:
    case MVT::v2i16:
    case MVT::v2i32:
    case MVT::v2i64:
    case MVT::v2f16:
    case MVT::v2bf16:
    case MVT::v2f32:
    case MVT::v2f64:
    case MVT::v4i8:
    case MVT::v4i16:
    case MVT::v4i32:
    case MVT::v4f16:
    case MVT::v4bf16:
    case MVT::v4f32:
    case MVT::v8f16:  // <4 x f16x2>
    case MVT::v8bf16: // <4 x bf16x2>
    case MVT::v8i16:  // <4 x i16x2>
      // This is a "native" vector type
      break;
    }

    MemSDNode *MemSD = cast<MemSDNode>(N);
    const DataLayout &TD = DAG.getDataLayout();

    Align Alignment = MemSD->getAlign();
    Align PrefAlign =
        TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
    if (Alignment < PrefAlign) {
      // This store is not sufficiently aligned, so bail out and let this vector
      // store be scalarized. Note that we may still be able to emit smaller
      // vector stores. For example, if we are storing a <4 x float> with an
      // alignment of 8, this check will fail but the legalizer will try again
      // with 2 x <2 x float>, which will succeed with an alignment of 8.
      return SDValue();
    }

    unsigned Opcode = 0;
    EVT EltVT = ValVT.getVectorElementType();
    unsigned NumElts = ValVT.getVectorNumElements();

    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
    // Therefore, we must ensure the type is legal. For i1 and i8, we set the
    // stored type to i16 and propagate the "real" type as the memory type.
    bool NeedExt = false;
    if (EltVT.getSizeInBits() < 16)
      NeedExt = true;

    bool StoreF16x2 = false;
    switch (NumElts) {
    default:
      return SDValue();
    case 2:
      Opcode = NVPTXISD::StoreV2;
      break;
    case 4:
      Opcode = NVPTXISD::StoreV4;
      break;
    case 8:
      // v8f16 is a special case. PTX doesn't have st.v8.f16
      // instruction. Instead, we split the vector into v2f16 chunks and
      // store them with st.v4.b32.
      assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
      Opcode = NVPTXISD::StoreV4;
      StoreF16x2 = true;
      break;
    }

    SmallVector<SDValue, 8> Ops;

    // First is the chain
    Ops.push_back(N->getOperand(0));

    if (StoreF16x2) {
      // Combine f16,f16 -> v2f16
      NumElts /= 2;
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                 DAG.getIntPtrConstant(i * 2, DL));
        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
        EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
        Ops.push_back(V2);
      }
    } else {
      // Then the split values
      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                     DAG.getIntPtrConstant(i, DL));
        if (NeedExt)
          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
        Ops.push_back(ExtVal);
      }
    }

    // Then any remaining arguments
    Ops.append(N->op_begin() + 2, N->op_end());

    SDValue NewSt =
        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
                                MemSD->getMemoryVT(), MemSD->getMemOperand());

    // return DCI.CombineTo(N, NewSt, true);
    return NewSt;
  }

  return SDValue();
}
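
// For illustration: the StoreF16x2 path above pairs elements 2*i and 2*i+1 of
// a v8f16/v8bf16/v8i16 value into one 32-bit lane so the store can be emitted
// as st.v4.b32. A standalone sketch of that packing on raw 16-bit values
// (which half occupies the low 16 bits is an assumption here, not something
// the DAG code above spells out):
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cstdint>

static uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
  // Two 16-bit elements share one 32-bit register lane.
  return (uint32_t)Lo | ((uint32_t)Hi << 16);
}

static void packV8x16(const uint16_t Elts[8], uint32_t Lanes[4]) {
  // Elements 2*i and 2*i+1 become the i-th b32 lane of the st.v4.b32.
  for (unsigned i = 0; i < 4; ++i)
    Lanes[i] = packHalves(Elts[2 * i], Elts[2 * i + 1]);
}
#endif
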
// st i1 v, addr
//    =>
// v1 = zxt v to i16
// st.u8 i16, addr
SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  StoreSDNode *ST = cast<StoreSDNode>(Node);
  SDValue Tmp1 = ST->getChain();
  SDValue Tmp2 = ST->getBasePtr();
  SDValue Tmp3 = ST->getValue();
  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
  SDValue Result =
      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
                        ST->getAlign(), ST->getMemOperand()->getFlags());
  return Result;
}
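
// For illustration: LowerSTOREi1 and LowerLOADi1 round-trip an i1 through an
// 8-bit memory slot, zero-extending on the store and truncating on the load.
// A minimal standalone sketch of the same round trip on plain memory:
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cassert>
#include <cstdint>

static void storeI1(uint8_t *Addr, bool V) { *Addr = (uint8_t)V; } // zext
static bool loadI1(const uint8_t *Addr) { return (*Addr & 1) != 0; } // trunc

static void i1RoundTrip() {
  uint8_t Slot = 0;
  storeI1(&Slot, true);
  assert(loadI1(&Slot) == true);
}
#endif
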
SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
                                                SelectionDAG &DAG) const {
  // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
  // operand so that it can pass the legalization.

  assert(Op.getOperand(1).getValueType() == MVT::i128 &&
         "Custom lowering for 128-bit CopyToReg only");

  SDNode *Node = Op.getNode();
  SDLoc DL(Node);

  SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
                           DAG.getIntPtrConstant(0, DL));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
                           DAG.getIntPtrConstant(1, DL));

  SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
  SmallVector<EVT, 3> ResultsType(Node->values());

  NewOps[0] = Op->getOperand(0); // Chain
  NewOps[1] = Op->getOperand(1); // Dst Reg
  NewOps[2] = Lo;                // Lower 64-bit
  NewOps[3] = Hi;                // Higher 64-bit
  if (Op.getNumOperands() == 4)
    NewOps[4] = Op->getOperand(3); // Glue if exists

  return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
}
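
// For illustration: the bitcast-to-v2i64 plus two extracts above is the usual
// low/high split of a 128-bit value. A standalone sketch (assumes a compiler
// with __int128 support and that element 0 corresponds to the low half, as
// with a little-endian bitcast):
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cstdint>

static void splitI128(unsigned __int128 V, uint64_t &Lo, uint64_t &Hi) {
  Lo = (uint64_t)V;         // lower 64 bits -> first CopyToReg operand
  Hi = (uint64_t)(V >> 64); // upper 64 bits -> second CopyToReg operand
}
#endif
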
unsigned NVPTXTargetLowering::getNumRegisters(
    LLVMContext &Context, EVT VT,
    std::optional<MVT> RegisterVT = std::nullopt) const {
  if (VT == MVT::i128 && RegisterVT == MVT::i128)
    return 1;
  return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
}

bool NVPTXTargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  if (Val.getValueType() == MVT::i128 && NumParts == 1) {
    Parts[0] = Val;
    return true;
  }
  return false;
}
// This creates a target external symbol for a function parameter.
// The name of the symbol is composed from the parameter index and the
// function name. A negative index corresponds to the special parameter
// (unsized array) used for passing variable arguments.
SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
                                            EVT v) const {
  StringRef SavedStr = nvTM->getStrPool().save(
      getParamName(&DAG.getMachineFunction().getFunction(), idx));
  return DAG.getTargetExternalSymbol(SavedStr.data(), v);
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const DataLayout &DL = DAG.getDataLayout();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  const Function *F = &MF.getFunction();
  const AttributeList &PAL = F->getAttributes();
  const TargetLowering *TLI = STI.getTargetLowering();

  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
  for (const Argument &I : F->args()) {
    theArgs.push_back(&I);
    argTypes.push_back(I.getType());
  }
  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
  // Ins.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Ins)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Ins.
  // So a different index should be used for indexing into Ins.
  // See similar issue in LowerCall.
  unsigned InsIdx = 0;

  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
    Type *Ty = argTypes[i];

    if (theArgs[i]->use_empty()) {
      // argument is dead
      if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
        SmallVector<EVT, 16> vtparts;

        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
        if (vtparts.empty())
          report_fatal_error("Empty parameter types are not supported");

        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(DL, Ty);
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }

    // In the following cases, assign a node order of "i+1"
    // to newly created nodes. The SDNodes for params have to
    // appear in the same order as their order of appearance
    // in the original function. "i+1" holds that order.
    if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
      bool aggregateIsPacked = false;
      if (StructType *STy = dyn_cast<StructType>(Ty))
        aggregateIsPacked = STy->isPacked();

      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
      if (VTs.empty())
        report_fatal_error("Empty parameter types are not supported");

      Align ArgAlign = getFunctionArgumentAlignment(
          F, Ty, i + AttributeList::FirstArgIndex, DL);
      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);

      SDValue Arg = getParamSymbol(DAG, i, PtrVT);
      int VecIdx = -1; // Index of the first element of the current vector.
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
        if (VectorInfo[parti] & PVF_FIRST) {
          assert(VecIdx == -1 && "Orphaned vector.");
          VecIdx = parti;
        }

        // That's the last element of this store op.
        if (VectorInfo[parti] & PVF_LAST) {
          unsigned NumElts = parti - VecIdx + 1;
          EVT EltVT = VTs[parti];
          // i1 is loaded/stored as i8.
          EVT LoadVT = EltVT;
          if (EltVT == MVT::i1)
            LoadVT = MVT::i8;
          else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
            // getLoad needs a vector type, but it can't handle
            // vectors which contain v2f16 or v2bf16 elements. So we must load
            // using i32 here and then bitcast back.
            LoadVT = MVT::i32;

          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
          SDValue VecAddr =
              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
          Value *srcValue = Constant::getNullValue(PointerType::get(
              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));

          const MaybeAlign PartAlign = [&]() -> MaybeAlign {
            if (aggregateIsPacked)
              return Align(1);
            if (NumElts != 1)
              return std::nullopt;
            Align PartAlign =
                DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
            return commonAlignment(PartAlign, Offsets[parti]);
          }();
          SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
                                  MachinePointerInfo(srcValue), PartAlign,
                                  MachineMemOperand::MODereferenceable |
                                      MachineMemOperand::MOInvariant);
          if (P.getNode())
            P.getNode()->setIROrder(i + 1);
          for (unsigned j = 0; j < NumElts; ++j) {
            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
                                      DAG.getIntPtrConstant(j, dl));
            // We've loaded i1 as an i8 and now must truncate it back to i1
            if (EltVT == MVT::i1)
              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
            // v2f16 was loaded as an i32. Now we must bitcast it back.
            else if (EltVT != LoadVT)
              Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);

            // If a promoted integer type is used, truncate down to the original
            MVT PromotedVT;
            if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
            }

            // Extend the element if necessary (e.g. an i8 is loaded
            // into an i16 register)
            if (Ins[InsIdx].VT.isInteger() &&
                Ins[InsIdx].VT.getFixedSizeInBits() >
                    LoadVT.getFixedSizeInBits()) {
              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                           : ISD::ZERO_EXTEND;
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
            }
            InVals.push_back(Elt);
          }

          // Reset vector tracking state.
          VecIdx = -1;
        }
        ++InsIdx;
      }
      if (VTs.size() > 0)
        --InsIdx;
      continue;
    }

    // Param has ByVal attribute
    // Return MoveParam(param symbol).
    // Ideally, the param symbol can be returned directly,
    // but when SDNode builder decides to use it in a CopyToReg(),
    // machine instruction fails because TargetExternalSymbol
    // (not lowered) is target dependent, and CopyToReg assumes
    // the source is lowered.
    EVT ObjectVT = getValueType(DL, Ty);
    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
    SDValue Arg = getParamSymbol(DAG, i, PtrVT);
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
    if (p.getNode())
      p.getNode()->setIROrder(i + 1);
    InVals.push_back(p);
  }

  if (!OutChains.empty())
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));

  return Chain;
}
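
// For illustration: the PartAlign lambda above clamps an element's ABI
// alignment by whatever alignment its byte offset still guarantees, which is
// what llvm::commonAlignment computes (the real helper lives in
// llvm/Support/Alignment.h). A standalone sketch of that computation on plain
// integers:
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cassert>
#include <cstdint>

// Keep the largest power of two dividing both the base alignment and the
// offset, i.e. the lowest set bit of (A | Offset).
static uint64_t commonAlignmentSketch(uint64_t A, uint64_t Offset) {
  uint64_t Bits = A | Offset;
  return Bits & (~Bits + 1); // isolate the lowest set bit
}

static void partAlignExample() {
  assert(commonAlignmentSketch(8, 0) == 8); // part at offset 0 keeps align 8
  assert(commonAlignmentSketch(8, 4) == 4); // offset 4 only guarantees 4
  assert(commonAlignmentSketch(8, 6) == 2); // offset 6 only guarantees 2
}
#endif
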
// Use byte-store when the param address of the return value is unaligned.
// This may happen when the return value is a field of a packed structure.
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
                                      uint64_t Offset, EVT ElementType,
                                      SDValue RetVal, const SDLoc &dl) {
  // Bit logic only works on integer types
  if (adjustElementType(ElementType))
    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);

  // Store each byte
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
                                   DAG.getConstant(i * 8, dl, MVT::i32));
    SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
                               ShiftVal};
    // Trunc store only the last byte by using
    //     st.param.b8
    // The register type can be larger than b8.
    Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                    DAG.getVTList(MVT::Other), StoreOperands,
                                    MVT::i8, MachinePointerInfo(), std::nullopt,
                                    MachineMemOperand::MOStore);
  }
  return Chain;
}
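
// For illustration: LowerUnalignedStoreRet emits one st.param.b8 per byte,
// materializing byte i as (value >> (8*i)) truncated to 8 bits. A standalone
// sketch of the same loop on a plain buffer (little-endian byte order):
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cstdint>

static void storeBytewise32(uint8_t *Buf, uint64_t Offset, uint32_t Val) {
  for (unsigned i = 0; i < 4; ++i)
    Buf[Offset + i] = (uint8_t)(Val >> (8 * i)); // shift, then keep low byte
}
#endif
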
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &dl, SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();
  Type *RetTy = MF.getFunction().getReturnType();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  const DataLayout &DL = DAG.getDataLayout();
  SmallVector<SDValue, 16> PromotedOutVals;
  SmallVector<EVT, 16> VTs;
  SmallVector<uint64_t, 16> Offsets;
  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue PromotedOutVal = OutVals[i];
    MVT PromotedVT;
    if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
      VTs[i] = EVT(PromotedVT);
    }
    if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
      llvm::ISD::NodeType Ext =
          Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
    }
    PromotedOutVals.push_back(PromotedOutVal);
  }

  auto VectorInfo = VectorizePTXValueVTs(
      VTs, Offsets,
      RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
                       : Align(1));

  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32-bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
  bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  SmallVector<SDValue, 6> StoreOperands;
  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue OutVal = OutVals[i];
    SDValue RetVal = PromotedOutVals[i];

    if (ExtendIntegerRetVal) {
      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                  : ISD::ZERO_EXTEND,
                           dl, MVT::i32, RetVal);
    } else if (OutVal.getValueSizeInBits() < 16) {
      // Use 16-bit registers for small load-stores as it's the
      // smallest general purpose register size supported by NVPTX.
      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
    }

    // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
    // for a scalar store. In such cases, fall back to byte stores.
    if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
      EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Align ElementTypeAlign =
          DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
      Align ElementAlign =
          commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
      if (ElementAlign < ElementTypeAlign) {
        assert(StoreOperands.empty() && "Orphaned operand list.");
        Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
                                       RetVal, dl);

        // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
        // into the graph, so just move on to the next element.
        continue;
      }
    }

    // New load/store. Record chain and offset operands.
    if (VectorInfo[i] & PVF_FIRST) {
      assert(StoreOperands.empty() && "Orphaned operand list.");
      StoreOperands.push_back(Chain);
      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
    }

    // Record the value to return.
    StoreOperands.push_back(RetVal);

    // That's the last element of this store op.
    if (VectorInfo[i] & PVF_LAST) {
      NVPTXISD::NodeType Op;
      unsigned NumElts = StoreOperands.size() - 2;
      switch (NumElts) {
      case 1:
        Op = NVPTXISD::StoreRetval;
        break;
      case 2:
        Op = NVPTXISD::StoreRetvalV2;
        break;
      case 4:
        Op = NVPTXISD::StoreRetvalV4;
        break;
      default:
        llvm_unreachable("Invalid vector info.");
      }

      // Adjust type of load/store op if we've extended the scalar
      // return value.
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Chain = DAG.getMemIntrinsicNode(
          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
          MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
      // Cleanup vector state.
      StoreOperands.clear();
    }
  }

  return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
}
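
// For illustration: per the PTX interoperability rule cited above, sub-32-bit
// integer return values are widened to 32 bits, choosing sign or zero
// extension based on the signext/zeroext flag. A standalone sketch of the
// same widening on plain integers (the i16 input width is just an example):
#if 0 // Illustrative sketch only; not part of the lowering implementation.
#include <cassert>
#include <cstdint>

static uint32_t widenRetVal(uint16_t V, bool IsSigned) {
  // Mirrors the SIGN_EXTEND / ZERO_EXTEND choice made above.
  return IsSigned ? (uint32_t)(int32_t)(int16_t)V : (uint32_t)V;
}

static void widenExample() {
  assert(widenRetVal(0xFFFF, /*IsSigned=*/true) == 0xFFFFFFFFu);  // -1 -> -1
  assert(widenRetVal(0xFFFF, /*IsSigned=*/false) == 0x0000FFFFu); // 65535
}
#endif
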
void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.size() > 1)
    return;
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3524
switch (Intrinsic) {
3525
default:
3526
return 0;
3527
3528
case Intrinsic::nvvm_tex_1d_v4f32_s32:
3529
return NVPTXISD::Tex1DFloatS32;
3530
case Intrinsic::nvvm_tex_1d_v4f32_f32:
3531
return NVPTXISD::Tex1DFloatFloat;
3532
case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3533
return NVPTXISD::Tex1DFloatFloatLevel;
3534
case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3535
return NVPTXISD::Tex1DFloatFloatGrad;
3536
case Intrinsic::nvvm_tex_1d_v4s32_s32:
3537
return NVPTXISD::Tex1DS32S32;
3538
case Intrinsic::nvvm_tex_1d_v4s32_f32:
3539
return NVPTXISD::Tex1DS32Float;
3540
case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3541
return NVPTXISD::Tex1DS32FloatLevel;
3542
case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3543
return NVPTXISD::Tex1DS32FloatGrad;
3544
case Intrinsic::nvvm_tex_1d_v4u32_s32:
3545
return NVPTXISD::Tex1DU32S32;
3546
case Intrinsic::nvvm_tex_1d_v4u32_f32:
3547
return NVPTXISD::Tex1DU32Float;
3548
case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3549
return NVPTXISD::Tex1DU32FloatLevel;
3550
case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3551
return NVPTXISD::Tex1DU32FloatGrad;
3552
3553
case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3554
return NVPTXISD::Tex1DArrayFloatS32;
3555
case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3556
return NVPTXISD::Tex1DArrayFloatFloat;
3557
case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3558
return NVPTXISD::Tex1DArrayFloatFloatLevel;
3559
case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3560
return NVPTXISD::Tex1DArrayFloatFloatGrad;
3561
case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3562
return NVPTXISD::Tex1DArrayS32S32;
3563
case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3564
return NVPTXISD::Tex1DArrayS32Float;
3565
case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3566
return NVPTXISD::Tex1DArrayS32FloatLevel;
3567
case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3568
return NVPTXISD::Tex1DArrayS32FloatGrad;
3569
case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3570
return NVPTXISD::Tex1DArrayU32S32;
3571
case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3572
return NVPTXISD::Tex1DArrayU32Float;
3573
case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3574
return NVPTXISD::Tex1DArrayU32FloatLevel;
3575
case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3576
return NVPTXISD::Tex1DArrayU32FloatGrad;
3577
3578
case Intrinsic::nvvm_tex_2d_v4f32_s32:
3579
return NVPTXISD::Tex2DFloatS32;
3580
case Intrinsic::nvvm_tex_2d_v4f32_f32:
3581
return NVPTXISD::Tex2DFloatFloat;
3582
case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3583
return NVPTXISD::Tex2DFloatFloatLevel;
3584
case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3585
return NVPTXISD::Tex2DFloatFloatGrad;
3586
case Intrinsic::nvvm_tex_2d_v4s32_s32:
3587
return NVPTXISD::Tex2DS32S32;
3588
case Intrinsic::nvvm_tex_2d_v4s32_f32:
3589
return NVPTXISD::Tex2DS32Float;
3590
case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3591
return NVPTXISD::Tex2DS32FloatLevel;
3592
case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3593
return NVPTXISD::Tex2DS32FloatGrad;
3594
case Intrinsic::nvvm_tex_2d_v4u32_s32:
3595
return NVPTXISD::Tex2DU32S32;
3596
case Intrinsic::nvvm_tex_2d_v4u32_f32:
3597
return NVPTXISD::Tex2DU32Float;
3598
case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3599
return NVPTXISD::Tex2DU32FloatLevel;
3600
case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3601
return NVPTXISD::Tex2DU32FloatGrad;
3602
3603
case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3604
return NVPTXISD::Tex2DArrayFloatS32;
3605
case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3606
return NVPTXISD::Tex2DArrayFloatFloat;
3607
case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3608
return NVPTXISD::Tex2DArrayFloatFloatLevel;
3609
case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3610
return NVPTXISD::Tex2DArrayFloatFloatGrad;
3611
case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3612
return NVPTXISD::Tex2DArrayS32S32;
3613
case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3614
return NVPTXISD::Tex2DArrayS32Float;
3615
case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3616
return NVPTXISD::Tex2DArrayS32FloatLevel;
3617
case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3618
return NVPTXISD::Tex2DArrayS32FloatGrad;
3619
case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3620
return NVPTXISD::Tex2DArrayU32S32;
3621
case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3622
return NVPTXISD::Tex2DArrayU32Float;
3623
case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3624
return NVPTXISD::Tex2DArrayU32FloatLevel;
3625
case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3626
return NVPTXISD::Tex2DArrayU32FloatGrad;
3627
3628
case Intrinsic::nvvm_tex_3d_v4f32_s32:
3629
return NVPTXISD::Tex3DFloatS32;
3630
case Intrinsic::nvvm_tex_3d_v4f32_f32:
3631
return NVPTXISD::Tex3DFloatFloat;
3632
case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3633
return NVPTXISD::Tex3DFloatFloatLevel;
3634
case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3635
return NVPTXISD::Tex3DFloatFloatGrad;
3636
case Intrinsic::nvvm_tex_3d_v4s32_s32:
3637
return NVPTXISD::Tex3DS32S32;
3638
case Intrinsic::nvvm_tex_3d_v4s32_f32:
3639
return NVPTXISD::Tex3DS32Float;
3640
case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3641
return NVPTXISD::Tex3DS32FloatLevel;
3642
case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3643
return NVPTXISD::Tex3DS32FloatGrad;
3644
case Intrinsic::nvvm_tex_3d_v4u32_s32:
3645
return NVPTXISD::Tex3DU32S32;
3646
case Intrinsic::nvvm_tex_3d_v4u32_f32:
3647
return NVPTXISD::Tex3DU32Float;
3648
case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3649
return NVPTXISD::Tex3DU32FloatLevel;
3650
case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3651
return NVPTXISD::Tex3DU32FloatGrad;
3652
3653
case Intrinsic::nvvm_tex_cube_v4f32_f32:
3654
return NVPTXISD::TexCubeFloatFloat;
3655
case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3656
return NVPTXISD::TexCubeFloatFloatLevel;
3657
case Intrinsic::nvvm_tex_cube_v4s32_f32:
3658
return NVPTXISD::TexCubeS32Float;
3659
case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3660
return NVPTXISD::TexCubeS32FloatLevel;
3661
case Intrinsic::nvvm_tex_cube_v4u32_f32:
3662
return NVPTXISD::TexCubeU32Float;
3663
case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3664
return NVPTXISD::TexCubeU32FloatLevel;
3665
3666
case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3667
return NVPTXISD::TexCubeArrayFloatFloat;
3668
case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3669
return NVPTXISD::TexCubeArrayFloatFloatLevel;
3670
case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3671
return NVPTXISD::TexCubeArrayS32Float;
3672
case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3673
return NVPTXISD::TexCubeArrayS32FloatLevel;
3674
case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3675
return NVPTXISD::TexCubeArrayU32Float;
3676
case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3677
return NVPTXISD::TexCubeArrayU32FloatLevel;
3678
3679
case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3680
return NVPTXISD::Tld4R2DFloatFloat;
3681
case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3682
return NVPTXISD::Tld4G2DFloatFloat;
3683
case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3684
return NVPTXISD::Tld4B2DFloatFloat;
3685
case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3686
return NVPTXISD::Tld4A2DFloatFloat;
3687
case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3688
return NVPTXISD::Tld4R2DS64Float;
3689
case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3690
return NVPTXISD::Tld4G2DS64Float;
3691
case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3692
return NVPTXISD::Tld4B2DS64Float;
3693
case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3694
return NVPTXISD::Tld4A2DS64Float;
3695
case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3696
return NVPTXISD::Tld4R2DU64Float;
3697
case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3698
return NVPTXISD::Tld4G2DU64Float;
3699
case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3700
return NVPTXISD::Tld4B2DU64Float;
3701
case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3702
return NVPTXISD::Tld4A2DU64Float;
3703
3704
case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3705
return NVPTXISD::TexUnified1DFloatS32;
3706
case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3707
return NVPTXISD::TexUnified1DFloatFloat;
3708
case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3709
return NVPTXISD::TexUnified1DFloatFloatLevel;
3710
case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3711
return NVPTXISD::TexUnified1DFloatFloatGrad;
3712
case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3713
return NVPTXISD::TexUnified1DS32S32;
3714
case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3715
return NVPTXISD::TexUnified1DS32Float;
3716
case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3717
return NVPTXISD::TexUnified1DS32FloatLevel;
3718
case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3719
return NVPTXISD::TexUnified1DS32FloatGrad;
3720
case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3721
return NVPTXISD::TexUnified1DU32S32;
3722
case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3723
return NVPTXISD::TexUnified1DU32Float;
3724
case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3725
return NVPTXISD::TexUnified1DU32FloatLevel;
3726
case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3727
return NVPTXISD::TexUnified1DU32FloatGrad;
3728
3729
case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3730
return NVPTXISD::TexUnified1DArrayFloatS32;
3731
case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3732
return NVPTXISD::TexUnified1DArrayFloatFloat;
3733
case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3734
return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3735
case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3736
return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3737
case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3738
return NVPTXISD::TexUnified1DArrayS32S32;
3739
case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3740
return NVPTXISD::TexUnified1DArrayS32Float;
3741
case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3742
return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3743
case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3744
return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3745
case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3746
return NVPTXISD::TexUnified1DArrayU32S32;
3747
case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3748
return NVPTXISD::TexUnified1DArrayU32Float;
3749
case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3750
return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3751
case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3752
return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3753
3754
case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3755
return NVPTXISD::TexUnified2DFloatS32;
3756
case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3757
return NVPTXISD::TexUnified2DFloatFloat;
3758
case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3759
return NVPTXISD::TexUnified2DFloatFloatLevel;
3760
case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3761
return NVPTXISD::TexUnified2DFloatFloatGrad;
3762
case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3763
return NVPTXISD::TexUnified2DS32S32;
3764
case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3765
return NVPTXISD::TexUnified2DS32Float;
3766
case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3767
return NVPTXISD::TexUnified2DS32FloatLevel;
3768
case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3769
return NVPTXISD::TexUnified2DS32FloatGrad;
3770
case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3771
return NVPTXISD::TexUnified2DU32S32;
3772
case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3773
return NVPTXISD::TexUnified2DU32Float;
3774
case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3775
return NVPTXISD::TexUnified2DU32FloatLevel;
3776
case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3777
return NVPTXISD::TexUnified2DU32FloatGrad;
3778
3779
case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3780
return NVPTXISD::TexUnified2DArrayFloatS32;
3781
case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3782
return NVPTXISD::TexUnified2DArrayFloatFloat;
3783
case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3784
return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3785
case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3786
return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3787
case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3788
return NVPTXISD::TexUnified2DArrayS32S32;
3789
case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3790
return NVPTXISD::TexUnified2DArrayS32Float;
3791
case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3792
return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3793
case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3794
return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3795
case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3796
return NVPTXISD::TexUnified2DArrayU32S32;
3797
case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3798
return NVPTXISD::TexUnified2DArrayU32Float;
3799
case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3800
return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3801
case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3802
return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3803
3804
case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3805
return NVPTXISD::TexUnified3DFloatS32;
3806
case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3807
return NVPTXISD::TexUnified3DFloatFloat;
3808
case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3809
return NVPTXISD::TexUnified3DFloatFloatLevel;
3810
case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3811
return NVPTXISD::TexUnified3DFloatFloatGrad;
3812
case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3813
return NVPTXISD::TexUnified3DS32S32;
3814
case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3815
return NVPTXISD::TexUnified3DS32Float;
3816
case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3817
return NVPTXISD::TexUnified3DS32FloatLevel;
3818
case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3819
return NVPTXISD::TexUnified3DS32FloatGrad;
3820
case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3821
return NVPTXISD::TexUnified3DU32S32;
3822
case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3823
return NVPTXISD::TexUnified3DU32Float;
3824
case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3825
return NVPTXISD::TexUnified3DU32FloatLevel;
3826
case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3827
return NVPTXISD::TexUnified3DU32FloatGrad;
3828
3829
case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3830
return NVPTXISD::TexUnifiedCubeFloatFloat;
3831
case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3832
return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3833
case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3834
return NVPTXISD::TexUnifiedCubeS32Float;
3835
case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3836
return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3837
case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3838
return NVPTXISD::TexUnifiedCubeU32Float;
3839
case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3840
return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3841
3842
case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3843
return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3844
case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3845
return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3846
case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3847
return NVPTXISD::TexUnifiedCubeArrayS32Float;
3848
case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3849
return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3850
case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3851
return NVPTXISD::TexUnifiedCubeArrayU32Float;
3852
case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3853
return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3854
3855
case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3856
return NVPTXISD::TexUnifiedCubeFloatFloatGrad;
3857
case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3858
return NVPTXISD::TexUnifiedCubeS32FloatGrad;
3859
case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3860
return NVPTXISD::TexUnifiedCubeU32FloatGrad;
3861
case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3862
return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;
3863
case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3864
return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;
3865
case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3866
return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;
3867
3868
case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3869
return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3870
case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3871
return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3872
case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3873
return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3874
case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3875
return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3876
case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3877
return NVPTXISD::Tld4UnifiedR2DS64Float;
3878
case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3879
return NVPTXISD::Tld4UnifiedG2DS64Float;
3880
case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3881
return NVPTXISD::Tld4UnifiedB2DS64Float;
3882
case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3883
return NVPTXISD::Tld4UnifiedA2DS64Float;
3884
case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3885
return NVPTXISD::Tld4UnifiedR2DU64Float;
3886
case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3887
return NVPTXISD::Tld4UnifiedG2DU64Float;
3888
case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3889
return NVPTXISD::Tld4UnifiedB2DU64Float;
3890
case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3891
return NVPTXISD::Tld4UnifiedA2DU64Float;
3892
}
3893
}
3894
3895
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3896
switch (Intrinsic) {
3897
default:
3898
return 0;
3899
case Intrinsic::nvvm_suld_1d_i8_clamp:
3900
return NVPTXISD::Suld1DI8Clamp;
3901
case Intrinsic::nvvm_suld_1d_i16_clamp:
3902
return NVPTXISD::Suld1DI16Clamp;
3903
case Intrinsic::nvvm_suld_1d_i32_clamp:
3904
return NVPTXISD::Suld1DI32Clamp;
3905
case Intrinsic::nvvm_suld_1d_i64_clamp:
3906
return NVPTXISD::Suld1DI64Clamp;
3907
case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3908
return NVPTXISD::Suld1DV2I8Clamp;
3909
case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3910
return NVPTXISD::Suld1DV2I16Clamp;
3911
case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3912
return NVPTXISD::Suld1DV2I32Clamp;
3913
case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3914
return NVPTXISD::Suld1DV2I64Clamp;
3915
case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3916
return NVPTXISD::Suld1DV4I8Clamp;
3917
case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3918
return NVPTXISD::Suld1DV4I16Clamp;
3919
case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3920
return NVPTXISD::Suld1DV4I32Clamp;
3921
case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3922
return NVPTXISD::Suld1DArrayI8Clamp;
3923
case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3924
return NVPTXISD::Suld1DArrayI16Clamp;
3925
case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3926
return NVPTXISD::Suld1DArrayI32Clamp;
3927
case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3928
return NVPTXISD::Suld1DArrayI64Clamp;
3929
case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3930
return NVPTXISD::Suld1DArrayV2I8Clamp;
3931
case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3932
return NVPTXISD::Suld1DArrayV2I16Clamp;
3933
case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3934
return NVPTXISD::Suld1DArrayV2I32Clamp;
3935
case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3936
return NVPTXISD::Suld1DArrayV2I64Clamp;
3937
case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3938
return NVPTXISD::Suld1DArrayV4I8Clamp;
3939
case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3940
return NVPTXISD::Suld1DArrayV4I16Clamp;
3941
case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3942
return NVPTXISD::Suld1DArrayV4I32Clamp;
3943
case Intrinsic::nvvm_suld_2d_i8_clamp:
3944
return NVPTXISD::Suld2DI8Clamp;
3945
case Intrinsic::nvvm_suld_2d_i16_clamp:
3946
return NVPTXISD::Suld2DI16Clamp;
3947
case Intrinsic::nvvm_suld_2d_i32_clamp:
3948
return NVPTXISD::Suld2DI32Clamp;
3949
case Intrinsic::nvvm_suld_2d_i64_clamp:
3950
return NVPTXISD::Suld2DI64Clamp;
3951
case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3952
return NVPTXISD::Suld2DV2I8Clamp;
3953
case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3954
return NVPTXISD::Suld2DV2I16Clamp;
3955
case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3956
return NVPTXISD::Suld2DV2I32Clamp;
3957
case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3958
return NVPTXISD::Suld2DV2I64Clamp;
3959
case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3960
return NVPTXISD::Suld2DV4I8Clamp;
3961
case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3962
return NVPTXISD::Suld2DV4I16Clamp;
3963
case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3964
return NVPTXISD::Suld2DV4I32Clamp;
3965
case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3966
return NVPTXISD::Suld2DArrayI8Clamp;
3967
case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3968
return NVPTXISD::Suld2DArrayI16Clamp;
3969
case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3970
return NVPTXISD::Suld2DArrayI32Clamp;
3971
case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3972
return NVPTXISD::Suld2DArrayI64Clamp;
3973
case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3974
return NVPTXISD::Suld2DArrayV2I8Clamp;
3975
case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3976
return NVPTXISD::Suld2DArrayV2I16Clamp;
3977
case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3978
return NVPTXISD::Suld2DArrayV2I32Clamp;
3979
case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3980
return NVPTXISD::Suld2DArrayV2I64Clamp;
3981
case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3982
return NVPTXISD::Suld2DArrayV4I8Clamp;
3983
case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3984
return NVPTXISD::Suld2DArrayV4I16Clamp;
3985
case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3986
return NVPTXISD::Suld2DArrayV4I32Clamp;
3987
case Intrinsic::nvvm_suld_3d_i8_clamp:
3988
return NVPTXISD::Suld3DI8Clamp;
3989
case Intrinsic::nvvm_suld_3d_i16_clamp:
3990
return NVPTXISD::Suld3DI16Clamp;
3991
case Intrinsic::nvvm_suld_3d_i32_clamp:
3992
return NVPTXISD::Suld3DI32Clamp;
3993
case Intrinsic::nvvm_suld_3d_i64_clamp:
3994
return NVPTXISD::Suld3DI64Clamp;
3995
case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3996
return NVPTXISD::Suld3DV2I8Clamp;
3997
case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3998
return NVPTXISD::Suld3DV2I16Clamp;
3999
case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4000
return NVPTXISD::Suld3DV2I32Clamp;
4001
case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4002
return NVPTXISD::Suld3DV2I64Clamp;
4003
case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4004
return NVPTXISD::Suld3DV4I8Clamp;
4005
case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4006
return NVPTXISD::Suld3DV4I16Clamp;
4007
case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4008
return NVPTXISD::Suld3DV4I32Clamp;
4009
case Intrinsic::nvvm_suld_1d_i8_trap:
4010
return NVPTXISD::Suld1DI8Trap;
4011
case Intrinsic::nvvm_suld_1d_i16_trap:
4012
return NVPTXISD::Suld1DI16Trap;
4013
case Intrinsic::nvvm_suld_1d_i32_trap:
4014
return NVPTXISD::Suld1DI32Trap;
4015
case Intrinsic::nvvm_suld_1d_i64_trap:
4016
return NVPTXISD::Suld1DI64Trap;
4017
case Intrinsic::nvvm_suld_1d_v2i8_trap:
4018
return NVPTXISD::Suld1DV2I8Trap;
4019
case Intrinsic::nvvm_suld_1d_v2i16_trap:
4020
return NVPTXISD::Suld1DV2I16Trap;
4021
case Intrinsic::nvvm_suld_1d_v2i32_trap:
4022
return NVPTXISD::Suld1DV2I32Trap;
4023
case Intrinsic::nvvm_suld_1d_v2i64_trap:
4024
return NVPTXISD::Suld1DV2I64Trap;
4025
case Intrinsic::nvvm_suld_1d_v4i8_trap:
4026
return NVPTXISD::Suld1DV4I8Trap;
4027
case Intrinsic::nvvm_suld_1d_v4i16_trap:
4028
return NVPTXISD::Suld1DV4I16Trap;
4029
case Intrinsic::nvvm_suld_1d_v4i32_trap:
4030
return NVPTXISD::Suld1DV4I32Trap;
4031
case Intrinsic::nvvm_suld_1d_array_i8_trap:
4032
return NVPTXISD::Suld1DArrayI8Trap;
4033
case Intrinsic::nvvm_suld_1d_array_i16_trap:
4034
return NVPTXISD::Suld1DArrayI16Trap;
4035
case Intrinsic::nvvm_suld_1d_array_i32_trap:
4036
return NVPTXISD::Suld1DArrayI32Trap;
4037
case Intrinsic::nvvm_suld_1d_array_i64_trap:
4038
return NVPTXISD::Suld1DArrayI64Trap;
4039
case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4040
return NVPTXISD::Suld1DArrayV2I8Trap;
4041
case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4042
return NVPTXISD::Suld1DArrayV2I16Trap;
4043
case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4044
return NVPTXISD::Suld1DArrayV2I32Trap;
4045
case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4046
return NVPTXISD::Suld1DArrayV2I64Trap;
4047
case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4048
return NVPTXISD::Suld1DArrayV4I8Trap;
4049
case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4050
return NVPTXISD::Suld1DArrayV4I16Trap;
4051
case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4052
return NVPTXISD::Suld1DArrayV4I32Trap;
4053
case Intrinsic::nvvm_suld_2d_i8_trap:
4054
return NVPTXISD::Suld2DI8Trap;
4055
case Intrinsic::nvvm_suld_2d_i16_trap:
4056
return NVPTXISD::Suld2DI16Trap;
4057
case Intrinsic::nvvm_suld_2d_i32_trap:
4058
return NVPTXISD::Suld2DI32Trap;
4059
case Intrinsic::nvvm_suld_2d_i64_trap:
4060
return NVPTXISD::Suld2DI64Trap;
4061
case Intrinsic::nvvm_suld_2d_v2i8_trap:
4062
return NVPTXISD::Suld2DV2I8Trap;
4063
case Intrinsic::nvvm_suld_2d_v2i16_trap:
4064
return NVPTXISD::Suld2DV2I16Trap;
4065
case Intrinsic::nvvm_suld_2d_v2i32_trap:
4066
return NVPTXISD::Suld2DV2I32Trap;
4067
case Intrinsic::nvvm_suld_2d_v2i64_trap:
4068
return NVPTXISD::Suld2DV2I64Trap;
4069
case Intrinsic::nvvm_suld_2d_v4i8_trap:
4070
return NVPTXISD::Suld2DV4I8Trap;
4071
case Intrinsic::nvvm_suld_2d_v4i16_trap:
4072
return NVPTXISD::Suld2DV4I16Trap;
4073
case Intrinsic::nvvm_suld_2d_v4i32_trap:
4074
return NVPTXISD::Suld2DV4I32Trap;
4075
case Intrinsic::nvvm_suld_2d_array_i8_trap:
4076
return NVPTXISD::Suld2DArrayI8Trap;
4077
case Intrinsic::nvvm_suld_2d_array_i16_trap:
4078
return NVPTXISD::Suld2DArrayI16Trap;
4079
case Intrinsic::nvvm_suld_2d_array_i32_trap:
4080
return NVPTXISD::Suld2DArrayI32Trap;
4081
case Intrinsic::nvvm_suld_2d_array_i64_trap:
4082
return NVPTXISD::Suld2DArrayI64Trap;
4083
case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4084
return NVPTXISD::Suld2DArrayV2I8Trap;
4085
case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4086
return NVPTXISD::Suld2DArrayV2I16Trap;
4087
case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4088
return NVPTXISD::Suld2DArrayV2I32Trap;
4089
case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4090
return NVPTXISD::Suld2DArrayV2I64Trap;
4091
case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4092
return NVPTXISD::Suld2DArrayV4I8Trap;
4093
case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4094
return NVPTXISD::Suld2DArrayV4I16Trap;
4095
case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4096
return NVPTXISD::Suld2DArrayV4I32Trap;
4097
case Intrinsic::nvvm_suld_3d_i8_trap:
4098
return NVPTXISD::Suld3DI8Trap;
4099
case Intrinsic::nvvm_suld_3d_i16_trap:
4100
return NVPTXISD::Suld3DI16Trap;
4101
case Intrinsic::nvvm_suld_3d_i32_trap:
4102
return NVPTXISD::Suld3DI32Trap;
4103
case Intrinsic::nvvm_suld_3d_i64_trap:
4104
return NVPTXISD::Suld3DI64Trap;
4105
case Intrinsic::nvvm_suld_3d_v2i8_trap:
4106
return NVPTXISD::Suld3DV2I8Trap;
4107
case Intrinsic::nvvm_suld_3d_v2i16_trap:
4108
return NVPTXISD::Suld3DV2I16Trap;
4109
case Intrinsic::nvvm_suld_3d_v2i32_trap:
4110
return NVPTXISD::Suld3DV2I32Trap;
4111
case Intrinsic::nvvm_suld_3d_v2i64_trap:
4112
return NVPTXISD::Suld3DV2I64Trap;
4113
case Intrinsic::nvvm_suld_3d_v4i8_trap:
4114
return NVPTXISD::Suld3DV4I8Trap;
4115
case Intrinsic::nvvm_suld_3d_v4i16_trap:
4116
return NVPTXISD::Suld3DV4I16Trap;
4117
case Intrinsic::nvvm_suld_3d_v4i32_trap:
4118
return NVPTXISD::Suld3DV4I32Trap;
4119
case Intrinsic::nvvm_suld_1d_i8_zero:
4120
return NVPTXISD::Suld1DI8Zero;
4121
case Intrinsic::nvvm_suld_1d_i16_zero:
4122
return NVPTXISD::Suld1DI16Zero;
4123
case Intrinsic::nvvm_suld_1d_i32_zero:
4124
return NVPTXISD::Suld1DI32Zero;
4125
case Intrinsic::nvvm_suld_1d_i64_zero:
4126
return NVPTXISD::Suld1DI64Zero;
4127
case Intrinsic::nvvm_suld_1d_v2i8_zero:
4128
return NVPTXISD::Suld1DV2I8Zero;
4129
case Intrinsic::nvvm_suld_1d_v2i16_zero:
4130
return NVPTXISD::Suld1DV2I16Zero;
4131
case Intrinsic::nvvm_suld_1d_v2i32_zero:
4132
return NVPTXISD::Suld1DV2I32Zero;
4133
case Intrinsic::nvvm_suld_1d_v2i64_zero:
4134
return NVPTXISD::Suld1DV2I64Zero;
4135
case Intrinsic::nvvm_suld_1d_v4i8_zero:
4136
return NVPTXISD::Suld1DV4I8Zero;
4137
case Intrinsic::nvvm_suld_1d_v4i16_zero:
4138
return NVPTXISD::Suld1DV4I16Zero;
4139
case Intrinsic::nvvm_suld_1d_v4i32_zero:
4140
return NVPTXISD::Suld1DV4I32Zero;
4141
case Intrinsic::nvvm_suld_1d_array_i8_zero:
4142
return NVPTXISD::Suld1DArrayI8Zero;
4143
case Intrinsic::nvvm_suld_1d_array_i16_zero:
4144
return NVPTXISD::Suld1DArrayI16Zero;
4145
case Intrinsic::nvvm_suld_1d_array_i32_zero:
4146
return NVPTXISD::Suld1DArrayI32Zero;
4147
case Intrinsic::nvvm_suld_1d_array_i64_zero:
4148
return NVPTXISD::Suld1DArrayI64Zero;
4149
case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4150
return NVPTXISD::Suld1DArrayV2I8Zero;
4151
case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4152
return NVPTXISD::Suld1DArrayV2I16Zero;
4153
case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4154
return NVPTXISD::Suld1DArrayV2I32Zero;
4155
case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4156
return NVPTXISD::Suld1DArrayV2I64Zero;
4157
case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4158
return NVPTXISD::Suld1DArrayV4I8Zero;
4159
case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4160
return NVPTXISD::Suld1DArrayV4I16Zero;
4161
case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4162
return NVPTXISD::Suld1DArrayV4I32Zero;
4163
case Intrinsic::nvvm_suld_2d_i8_zero:
4164
return NVPTXISD::Suld2DI8Zero;
4165
case Intrinsic::nvvm_suld_2d_i16_zero:
4166
return NVPTXISD::Suld2DI16Zero;
4167
case Intrinsic::nvvm_suld_2d_i32_zero:
4168
return NVPTXISD::Suld2DI32Zero;
4169
case Intrinsic::nvvm_suld_2d_i64_zero:
4170
return NVPTXISD::Suld2DI64Zero;
4171
case Intrinsic::nvvm_suld_2d_v2i8_zero:
4172
return NVPTXISD::Suld2DV2I8Zero;
4173
case Intrinsic::nvvm_suld_2d_v2i16_zero:
4174
return NVPTXISD::Suld2DV2I16Zero;
4175
case Intrinsic::nvvm_suld_2d_v2i32_zero:
4176
return NVPTXISD::Suld2DV2I32Zero;
4177
case Intrinsic::nvvm_suld_2d_v2i64_zero:
4178
return NVPTXISD::Suld2DV2I64Zero;
4179
case Intrinsic::nvvm_suld_2d_v4i8_zero:
4180
return NVPTXISD::Suld2DV4I8Zero;
4181
case Intrinsic::nvvm_suld_2d_v4i16_zero:
4182
return NVPTXISD::Suld2DV4I16Zero;
4183
case Intrinsic::nvvm_suld_2d_v4i32_zero:
4184
return NVPTXISD::Suld2DV4I32Zero;
4185
case Intrinsic::nvvm_suld_2d_array_i8_zero:
4186
return NVPTXISD::Suld2DArrayI8Zero;
4187
case Intrinsic::nvvm_suld_2d_array_i16_zero:
4188
return NVPTXISD::Suld2DArrayI16Zero;
4189
case Intrinsic::nvvm_suld_2d_array_i32_zero:
4190
return NVPTXISD::Suld2DArrayI32Zero;
4191
case Intrinsic::nvvm_suld_2d_array_i64_zero:
4192
return NVPTXISD::Suld2DArrayI64Zero;
4193
case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4194
return NVPTXISD::Suld2DArrayV2I8Zero;
4195
case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4196
return NVPTXISD::Suld2DArrayV2I16Zero;
4197
case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4198
return NVPTXISD::Suld2DArrayV2I32Zero;
4199
case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4200
return NVPTXISD::Suld2DArrayV2I64Zero;
4201
case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4202
return NVPTXISD::Suld2DArrayV4I8Zero;
4203
case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4204
return NVPTXISD::Suld2DArrayV4I16Zero;
4205
case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4206
return NVPTXISD::Suld2DArrayV4I32Zero;
4207
case Intrinsic::nvvm_suld_3d_i8_zero:
4208
return NVPTXISD::Suld3DI8Zero;
4209
case Intrinsic::nvvm_suld_3d_i16_zero:
4210
return NVPTXISD::Suld3DI16Zero;
4211
case Intrinsic::nvvm_suld_3d_i32_zero:
4212
return NVPTXISD::Suld3DI32Zero;
4213
case Intrinsic::nvvm_suld_3d_i64_zero:
4214
return NVPTXISD::Suld3DI64Zero;
4215
case Intrinsic::nvvm_suld_3d_v2i8_zero:
4216
return NVPTXISD::Suld3DV2I8Zero;
4217
case Intrinsic::nvvm_suld_3d_v2i16_zero:
4218
return NVPTXISD::Suld3DV2I16Zero;
4219
case Intrinsic::nvvm_suld_3d_v2i32_zero:
4220
return NVPTXISD::Suld3DV2I32Zero;
4221
case Intrinsic::nvvm_suld_3d_v2i64_zero:
4222
return NVPTXISD::Suld3DV2I64Zero;
4223
case Intrinsic::nvvm_suld_3d_v4i8_zero:
4224
return NVPTXISD::Suld3DV4I8Zero;
4225
case Intrinsic::nvvm_suld_3d_v4i16_zero:
4226
return NVPTXISD::Suld3DV4I16Zero;
4227
case Intrinsic::nvvm_suld_3d_v4i32_zero:
4228
return NVPTXISD::Suld3DV4I32Zero;
4229
}
4230
}
4231
4232
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4233
// TgtMemIntrinsic
4234
// because we need the information that is only available in the "Value" type
4235
// of destination
4236
// pointer. In particular, the address space information.
4237
bool NVPTXTargetLowering::getTgtMemIntrinsic(
4238
IntrinsicInfo &Info, const CallInst &I,
4239
MachineFunction &MF, unsigned Intrinsic) const {
4240
switch (Intrinsic) {
4241
default:
4242
return false;
4243
case Intrinsic::nvvm_match_all_sync_i32p:
4244
case Intrinsic::nvvm_match_all_sync_i64p:
4245
Info.opc = ISD::INTRINSIC_W_CHAIN;
4246
// memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4247
// in order to model data exchange with other threads, but perform no real
4248
// memory accesses.
4249
Info.memVT = MVT::i1;
4250
4251
// Our result depends on both our and other thread's arguments.
4252
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4253
return true;
4254
case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4255
case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4256
case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4257
case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4258
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4259
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4260
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4261
case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4262
case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4263
case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4264
case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4265
case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4266
case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4267
case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4268
case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4269
case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4270
case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4271
case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4272
case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4273
case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4274
case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4275
case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4276
case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4277
case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4278
Info.opc = ISD::INTRINSIC_W_CHAIN;
4279
Info.memVT = MVT::v8f16;
4280
Info.ptrVal = I.getArgOperand(0);
4281
Info.offset = 0;
4282
Info.flags = MachineMemOperand::MOLoad;
4283
Info.align = Align(16);
4284
return true;
4285
}
4286
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4287
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4288
case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4289
case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4290
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4291
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4292
case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4293
case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4294
case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4295
case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4296
case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4297
case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4298
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4299
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4300
case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4301
case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4302
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4303
case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4304
case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4305
case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4306
case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4307
case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4308
case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4309
case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4310
Info.opc = ISD::INTRINSIC_W_CHAIN;
4311
Info.memVT = MVT::v2i32;
4312
Info.ptrVal = I.getArgOperand(0);
4313
Info.offset = 0;
4314
Info.flags = MachineMemOperand::MOLoad;
4315
Info.align = Align(8);
4316
return true;
4317
}
4318
4319
case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4320
case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4321
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4322
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4323
case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4324
case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4325
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4326
case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4327
case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4328
case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4329
case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4330
case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4331
case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4332
case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4333
case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4334
case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4335
4336
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4337
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4338
case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4339
case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4340
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4341
case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4342
case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4343
case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4344
case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4345
case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4346
case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4347
case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4348
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4349
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4350
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4351
case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4352
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4353
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4354
Info.opc = ISD::INTRINSIC_W_CHAIN;
4355
Info.memVT = MVT::v4i32;
4356
Info.ptrVal = I.getArgOperand(0);
4357
Info.offset = 0;
4358
Info.flags = MachineMemOperand::MOLoad;
4359
Info.align = Align(16);
4360
return true;
4361
}
4362
4363
case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4364
case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4365
case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4366
case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4367
case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4368
case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4369
case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4370
case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4371
4372
case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4373
case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4374
case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4375
case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4376
case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4377
case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4378
case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4379
case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4380
case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4381
case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4382
case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4383
case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4384
case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4385
case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4386
case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4387
case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4388
case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4389
case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4390
case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4391
case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4392
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4393
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4394
Info.opc = ISD::INTRINSIC_W_CHAIN;
4395
Info.memVT = MVT::i32;
4396
Info.ptrVal = I.getArgOperand(0);
4397
Info.offset = 0;
4398
Info.flags = MachineMemOperand::MOLoad;
4399
Info.align = Align(4);
4400
return true;
4401
}
4402
4403
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4404
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4405
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4406
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4407
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4408
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4409
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4410
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4411
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4412
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4413
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4414
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4415
Info.opc = ISD::INTRINSIC_W_CHAIN;
4416
Info.memVT = MVT::v4f16;
4417
Info.ptrVal = I.getArgOperand(0);
4418
Info.offset = 0;
4419
Info.flags = MachineMemOperand::MOLoad;
4420
Info.align = Align(16);
4421
return true;
4422
}
4423
4424
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4425
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4426
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4427
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4428
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4429
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4430
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4431
case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4432
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4433
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4434
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4435
case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4436
case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4437
case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4438
case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4439
case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4440
Info.opc = ISD::INTRINSIC_W_CHAIN;
4441
Info.memVT = MVT::v8f32;
4442
Info.ptrVal = I.getArgOperand(0);
4443
Info.offset = 0;
4444
Info.flags = MachineMemOperand::MOLoad;
4445
Info.align = Align(16);
4446
return true;
4447
}
4448
4449
case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4450
case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4451
case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4452
case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4453
4454
case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4455
case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4456
case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4457
case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4458
4459
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4460
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4461
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4462
case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4463
case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4464
case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4465
case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4466
case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4467
case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4468
case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4469
case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4470
case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4471
Info.opc = ISD::INTRINSIC_W_CHAIN;
4472
Info.memVT = MVT::v8i32;
4473
Info.ptrVal = I.getArgOperand(0);
4474
Info.offset = 0;
4475
Info.flags = MachineMemOperand::MOLoad;
4476
Info.align = Align(16);
4477
return true;
4478
}
4479
4480
case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4481
case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4482
case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4483
case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4484
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4485
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4486
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4487
case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4488
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4489
case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4490
Info.opc = ISD::INTRINSIC_W_CHAIN;
4491
Info.memVT = MVT::v2i32;
4492
Info.ptrVal = I.getArgOperand(0);
4493
Info.offset = 0;
4494
Info.flags = MachineMemOperand::MOLoad;
4495
Info.align = Align(8);
4496
return true;
4497
}
4498
4499
case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4500
case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4501
case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4502
case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4503
4504
case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4505
case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4506
case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4507
case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4508
Info.opc = ISD::INTRINSIC_W_CHAIN;
4509
Info.memVT = MVT::f64;
4510
Info.ptrVal = I.getArgOperand(0);
4511
Info.offset = 0;
4512
Info.flags = MachineMemOperand::MOLoad;
4513
Info.align = Align(8);
4514
return true;
4515
}
4516
4517
case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4518
case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4519
case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4520
case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4521
Info.opc = ISD::INTRINSIC_W_CHAIN;
4522
Info.memVT = MVT::v2f64;
4523
Info.ptrVal = I.getArgOperand(0);
4524
Info.offset = 0;
4525
Info.flags = MachineMemOperand::MOLoad;
4526
Info.align = Align(16);
4527
return true;
4528
}
4529
4530
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4531
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4532
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4533
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4534
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4535
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4536
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4537
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4538
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4539
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4540
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4541
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4542
Info.opc = ISD::INTRINSIC_VOID;
4543
Info.memVT = MVT::v4f16;
4544
Info.ptrVal = I.getArgOperand(0);
4545
Info.offset = 0;
4546
Info.flags = MachineMemOperand::MOStore;
4547
Info.align = Align(16);
4548
return true;
4549
}
4550
4551
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4552
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4553
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4554
case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4555
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4556
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4557
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4558
case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4559
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4560
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4561
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4562
case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4563
case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4564
case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4565
case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4566
case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4567
Info.opc = ISD::INTRINSIC_VOID;
4568
Info.memVT = MVT::v8f32;
4569
Info.ptrVal = I.getArgOperand(0);
4570
Info.offset = 0;
4571
Info.flags = MachineMemOperand::MOStore;
4572
Info.align = Align(16);
4573
return true;
4574
}
4575
4576
case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4577
case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4578
case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4579
case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4580
case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4581
case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4582
case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4583
case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4584
case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4585
case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4586
case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4587
case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4588
Info.opc = ISD::INTRINSIC_VOID;
4589
Info.memVT = MVT::v8i32;
4590
Info.ptrVal = I.getArgOperand(0);
4591
Info.offset = 0;
4592
Info.flags = MachineMemOperand::MOStore;
4593
Info.align = Align(16);
4594
return true;
4595
}
4596
4597
case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4598
case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4599
case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4600
case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4601
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4602
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4603
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4604
case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4605
Info.opc = ISD::INTRINSIC_VOID;
4606
Info.memVT = MVT::v2i32;
4607
Info.ptrVal = I.getArgOperand(0);
4608
Info.offset = 0;
4609
Info.flags = MachineMemOperand::MOStore;
4610
Info.align = Align(8);
4611
return true;
4612
}
4613
4614
case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4615
case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4616
case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4617
case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4618
Info.opc = ISD::INTRINSIC_VOID;
4619
Info.memVT = MVT::v2f64;
4620
Info.ptrVal = I.getArgOperand(0);
4621
Info.offset = 0;
4622
Info.flags = MachineMemOperand::MOStore;
4623
Info.align = Align(16);
4624
return true;
4625
}
4626
4627
case Intrinsic::nvvm_atomic_load_inc_32:
4628
case Intrinsic::nvvm_atomic_load_dec_32:
4629
4630
case Intrinsic::nvvm_atomic_add_gen_f_cta:
4631
case Intrinsic::nvvm_atomic_add_gen_f_sys:
4632
case Intrinsic::nvvm_atomic_add_gen_i_cta:
4633
case Intrinsic::nvvm_atomic_add_gen_i_sys:
4634
case Intrinsic::nvvm_atomic_and_gen_i_cta:
4635
case Intrinsic::nvvm_atomic_and_gen_i_sys:
4636
case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4637
case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4638
case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4639
case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4640
case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4641
case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4642
case Intrinsic::nvvm_atomic_max_gen_i_cta:
4643
case Intrinsic::nvvm_atomic_max_gen_i_sys:
4644
case Intrinsic::nvvm_atomic_min_gen_i_cta:
4645
case Intrinsic::nvvm_atomic_min_gen_i_sys:
4646
case Intrinsic::nvvm_atomic_or_gen_i_cta:
4647
case Intrinsic::nvvm_atomic_or_gen_i_sys:
4648
case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4649
case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4650
case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4651
case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4652
auto &DL = I.getDataLayout();
4653
Info.opc = ISD::INTRINSIC_W_CHAIN;
4654
Info.memVT = getValueType(DL, I.getType());
4655
Info.ptrVal = I.getArgOperand(0);
4656
Info.offset = 0;
4657
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4658
Info.align.reset();
4659
return true;
4660
}
4661
4662
case Intrinsic::nvvm_ldu_global_i:
4663
case Intrinsic::nvvm_ldu_global_f:
4664
case Intrinsic::nvvm_ldu_global_p: {
4665
auto &DL = I.getDataLayout();
4666
Info.opc = ISD::INTRINSIC_W_CHAIN;
4667
if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4668
Info.memVT = getValueType(DL, I.getType());
4669
else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
4670
Info.memVT = getPointerTy(DL);
4671
else
4672
Info.memVT = getValueType(DL, I.getType());
4673
Info.ptrVal = I.getArgOperand(0);
4674
Info.offset = 0;
4675
Info.flags = MachineMemOperand::MOLoad;
4676
Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4677
4678
return true;
4679
}
4680
case Intrinsic::nvvm_ldg_global_i:
4681
case Intrinsic::nvvm_ldg_global_f:
4682
case Intrinsic::nvvm_ldg_global_p: {
4683
auto &DL = I.getDataLayout();
4684
4685
Info.opc = ISD::INTRINSIC_W_CHAIN;
4686
if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4687
Info.memVT = getValueType(DL, I.getType());
4688
else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
4689
Info.memVT = getPointerTy(DL);
4690
else
4691
Info.memVT = getValueType(DL, I.getType());
4692
Info.ptrVal = I.getArgOperand(0);
4693
Info.offset = 0;
4694
Info.flags = MachineMemOperand::MOLoad;
4695
Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4696
4697
return true;
4698
}
4699
4700
case Intrinsic::nvvm_tex_1d_v4f32_s32:
4701
case Intrinsic::nvvm_tex_1d_v4f32_f32:
4702
case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4703
case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4704
case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4705
case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4706
case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4707
case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4708
case Intrinsic::nvvm_tex_2d_v4f32_s32:
4709
case Intrinsic::nvvm_tex_2d_v4f32_f32:
4710
case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4711
case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4712
case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4713
case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4714
case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4715
case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4716
case Intrinsic::nvvm_tex_3d_v4f32_s32:
4717
case Intrinsic::nvvm_tex_3d_v4f32_f32:
4718
case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4719
case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4720
case Intrinsic::nvvm_tex_cube_v4f32_f32:
4721
case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4722
case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4723
case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4724
case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4725
case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4726
case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4727
case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4728
case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4729
case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4730
case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4731
case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4732
case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4733
case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4734
case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4735
case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4736
case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4737
case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4738
case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4739
case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4740
case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4741
case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4742
case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4743
case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4744
case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4745
case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4746
case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4747
case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4748
case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4749
case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4750
case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4751
case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4752
case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4753
case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4754
case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4755
case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4756
case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4757
case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4758
Info.opc = getOpcForTextureInstr(Intrinsic);
4759
Info.memVT = MVT::v4f32;
4760
Info.ptrVal = nullptr;
4761
Info.offset = 0;
4762
Info.flags = MachineMemOperand::MOLoad;
4763
Info.align = Align(16);
4764
return true;
4765
4766
case Intrinsic::nvvm_tex_1d_v4s32_s32:
4767
case Intrinsic::nvvm_tex_1d_v4s32_f32:
4768
case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4769
case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4770
case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4771
case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4772
case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4773
case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4774
case Intrinsic::nvvm_tex_2d_v4s32_s32:
4775
case Intrinsic::nvvm_tex_2d_v4s32_f32:
4776
case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4777
case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4778
case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4779
case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4780
case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4781
case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4782
case Intrinsic::nvvm_tex_3d_v4s32_s32:
4783
case Intrinsic::nvvm_tex_3d_v4s32_f32:
4784
case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4785
case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4786
case Intrinsic::nvvm_tex_cube_v4s32_f32:
4787
case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4788
case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4789
case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4790
case Intrinsic::nvvm_tex_cube_v4u32_f32:
4791
case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4792
case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4793
case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4794
case Intrinsic::nvvm_tex_1d_v4u32_s32:
4795
case Intrinsic::nvvm_tex_1d_v4u32_f32:
4796
case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4797
case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4798
case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4799
case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4800
case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4801
case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4802
case Intrinsic::nvvm_tex_2d_v4u32_s32:
4803
case Intrinsic::nvvm_tex_2d_v4u32_f32:
4804
case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4805
case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4806
case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4807
case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4808
case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4809
case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4810
case Intrinsic::nvvm_tex_3d_v4u32_s32:
4811
case Intrinsic::nvvm_tex_3d_v4u32_f32:
4812
case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4813
case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4814
case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4815
case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4816
case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4817
case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4818
case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4819
case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4820
case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4821
case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4822
case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4823
case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4824
case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4825
case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4826
case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4827
case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4828
case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4829
case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4830
case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4831
case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4832
case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4833
case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4834
case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4835
case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4836
case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4837
case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4838
case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4839
case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4840
case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4841
case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4842
case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4843
case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4844
case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4845
case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4846
case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4847
case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4848
case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4849
case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4850
case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4851
case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4852
case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4853
case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4854
case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4855
case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4856
case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4857
case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4858
case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4859
case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4860
case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4861
case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4862
case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4863
case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4864
case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4865
case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4866
case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4867
case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4868
case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4869
case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4870
case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4871
case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4872
case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4873
case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4874
case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4875
case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4876
case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4877
case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4878
case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4879
case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4880
case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4881
case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4882
Info.opc = getOpcForTextureInstr(Intrinsic);
4883
Info.memVT = MVT::v4i32;
4884
Info.ptrVal = nullptr;
4885
Info.offset = 0;
4886
Info.flags = MachineMemOperand::MOLoad;
4887
Info.align = Align(16);
4888
return true;
4889
4890
case Intrinsic::nvvm_suld_1d_i8_clamp:
4891
case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4892
case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4893
case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4894
case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4895
case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4896
case Intrinsic::nvvm_suld_2d_i8_clamp:
4897
case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4898
case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4899
case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4900
case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4901
case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4902
case Intrinsic::nvvm_suld_3d_i8_clamp:
4903
case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4904
case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4905
case Intrinsic::nvvm_suld_1d_i8_trap:
4906
case Intrinsic::nvvm_suld_1d_v2i8_trap:
4907
case Intrinsic::nvvm_suld_1d_v4i8_trap:
4908
case Intrinsic::nvvm_suld_1d_array_i8_trap:
4909
case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4910
case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4911
case Intrinsic::nvvm_suld_2d_i8_trap:
4912
case Intrinsic::nvvm_suld_2d_v2i8_trap:
4913
case Intrinsic::nvvm_suld_2d_v4i8_trap:
4914
case Intrinsic::nvvm_suld_2d_array_i8_trap:
4915
case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4916
case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4917
case Intrinsic::nvvm_suld_3d_i8_trap:
4918
case Intrinsic::nvvm_suld_3d_v2i8_trap:
4919
case Intrinsic::nvvm_suld_3d_v4i8_trap:
4920
case Intrinsic::nvvm_suld_1d_i8_zero:
4921
case Intrinsic::nvvm_suld_1d_v2i8_zero:
4922
case Intrinsic::nvvm_suld_1d_v4i8_zero:
4923
case Intrinsic::nvvm_suld_1d_array_i8_zero:
4924
case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4925
case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4926
case Intrinsic::nvvm_suld_2d_i8_zero:
4927
case Intrinsic::nvvm_suld_2d_v2i8_zero:
4928
case Intrinsic::nvvm_suld_2d_v4i8_zero:
4929
case Intrinsic::nvvm_suld_2d_array_i8_zero:
4930
case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4931
case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4932
case Intrinsic::nvvm_suld_3d_i8_zero:
4933
case Intrinsic::nvvm_suld_3d_v2i8_zero:
4934
case Intrinsic::nvvm_suld_3d_v4i8_zero:
4935
Info.opc = getOpcForSurfaceInstr(Intrinsic);
4936
Info.memVT = MVT::i8;
4937
Info.ptrVal = nullptr;
4938
Info.offset = 0;
4939
Info.flags = MachineMemOperand::MOLoad;
4940
Info.align = Align(16);
4941
return true;
4942
4943
case Intrinsic::nvvm_suld_1d_i16_clamp:
4944
case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4945
case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4946
case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4947
case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4948
case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4949
case Intrinsic::nvvm_suld_2d_i16_clamp:
4950
case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4951
case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4952
case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4953
case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4954
case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4955
case Intrinsic::nvvm_suld_3d_i16_clamp:
4956
case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4957
case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4958
case Intrinsic::nvvm_suld_1d_i16_trap:
4959
case Intrinsic::nvvm_suld_1d_v2i16_trap:
4960
case Intrinsic::nvvm_suld_1d_v4i16_trap:
4961
case Intrinsic::nvvm_suld_1d_array_i16_trap:
4962
case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4963
case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4964
case Intrinsic::nvvm_suld_2d_i16_trap:
4965
case Intrinsic::nvvm_suld_2d_v2i16_trap:
4966
case Intrinsic::nvvm_suld_2d_v4i16_trap:
4967
case Intrinsic::nvvm_suld_2d_array_i16_trap:
4968
case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4969
case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4970
case Intrinsic::nvvm_suld_3d_i16_trap:
4971
case Intrinsic::nvvm_suld_3d_v2i16_trap:
4972
case Intrinsic::nvvm_suld_3d_v4i16_trap:
4973
case Intrinsic::nvvm_suld_1d_i16_zero:
4974
case Intrinsic::nvvm_suld_1d_v2i16_zero:
4975
case Intrinsic::nvvm_suld_1d_v4i16_zero:
4976
case Intrinsic::nvvm_suld_1d_array_i16_zero:
4977
case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4978
case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4979
case Intrinsic::nvvm_suld_2d_i16_zero:
4980
case Intrinsic::nvvm_suld_2d_v2i16_zero:
4981
case Intrinsic::nvvm_suld_2d_v4i16_zero:
4982
case Intrinsic::nvvm_suld_2d_array_i16_zero:
4983
case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4984
case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4985
case Intrinsic::nvvm_suld_3d_i16_zero:
4986
case Intrinsic::nvvm_suld_3d_v2i16_zero:
4987
case Intrinsic::nvvm_suld_3d_v4i16_zero:
4988
Info.opc = getOpcForSurfaceInstr(Intrinsic);
4989
Info.memVT = MVT::i16;
4990
Info.ptrVal = nullptr;
4991
Info.offset = 0;
4992
Info.flags = MachineMemOperand::MOLoad;
4993
Info.align = Align(16);
4994
return true;
4995
4996
case Intrinsic::nvvm_suld_1d_i32_clamp:
4997
case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4998
case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4999
case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5000
case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5001
case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5002
case Intrinsic::nvvm_suld_2d_i32_clamp:
5003
case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5004
case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5005
case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5006
case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5007
case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5008
case Intrinsic::nvvm_suld_3d_i32_clamp:
5009
case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5010
case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5011
case Intrinsic::nvvm_suld_1d_i32_trap:
5012
case Intrinsic::nvvm_suld_1d_v2i32_trap:
5013
case Intrinsic::nvvm_suld_1d_v4i32_trap:
5014
case Intrinsic::nvvm_suld_1d_array_i32_trap:
5015
case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5016
case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5017
case Intrinsic::nvvm_suld_2d_i32_trap:
5018
case Intrinsic::nvvm_suld_2d_v2i32_trap:
5019
case Intrinsic::nvvm_suld_2d_v4i32_trap:
5020
case Intrinsic::nvvm_suld_2d_array_i32_trap:
5021
case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5022
case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5023
case Intrinsic::nvvm_suld_3d_i32_trap:
5024
case Intrinsic::nvvm_suld_3d_v2i32_trap:
5025
case Intrinsic::nvvm_suld_3d_v4i32_trap:
5026
case Intrinsic::nvvm_suld_1d_i32_zero:
5027
case Intrinsic::nvvm_suld_1d_v2i32_zero:
5028
case Intrinsic::nvvm_suld_1d_v4i32_zero:
5029
case Intrinsic::nvvm_suld_1d_array_i32_zero:
5030
case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5031
case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5032
case Intrinsic::nvvm_suld_2d_i32_zero:
5033
case Intrinsic::nvvm_suld_2d_v2i32_zero:
5034
case Intrinsic::nvvm_suld_2d_v4i32_zero:
5035
case Intrinsic::nvvm_suld_2d_array_i32_zero:
5036
case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5037
case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5038
case Intrinsic::nvvm_suld_3d_i32_zero:
5039
case Intrinsic::nvvm_suld_3d_v2i32_zero:
5040
case Intrinsic::nvvm_suld_3d_v4i32_zero:
5041
Info.opc = getOpcForSurfaceInstr(Intrinsic);
5042
Info.memVT = MVT::i32;
5043
Info.ptrVal = nullptr;
5044
Info.offset = 0;
5045
Info.flags = MachineMemOperand::MOLoad;
5046
Info.align = Align(16);
5047
return true;
5048
5049
case Intrinsic::nvvm_suld_1d_i64_clamp:
5050
case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5051
case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5052
case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5053
case Intrinsic::nvvm_suld_2d_i64_clamp:
5054
case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5055
case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5056
case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5057
case Intrinsic::nvvm_suld_3d_i64_clamp:
5058
case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5059
case Intrinsic::nvvm_suld_1d_i64_trap:
5060
case Intrinsic::nvvm_suld_1d_v2i64_trap:
5061
case Intrinsic::nvvm_suld_1d_array_i64_trap:
5062
case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5063
case Intrinsic::nvvm_suld_2d_i64_trap:
5064
case Intrinsic::nvvm_suld_2d_v2i64_trap:
5065
case Intrinsic::nvvm_suld_2d_array_i64_trap:
5066
case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5067
case Intrinsic::nvvm_suld_3d_i64_trap:
5068
case Intrinsic::nvvm_suld_3d_v2i64_trap:
5069
case Intrinsic::nvvm_suld_1d_i64_zero:
5070
case Intrinsic::nvvm_suld_1d_v2i64_zero:
5071
case Intrinsic::nvvm_suld_1d_array_i64_zero:
5072
case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5073
case Intrinsic::nvvm_suld_2d_i64_zero:
5074
case Intrinsic::nvvm_suld_2d_v2i64_zero:
5075
case Intrinsic::nvvm_suld_2d_array_i64_zero:
5076
case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5077
case Intrinsic::nvvm_suld_3d_i64_zero:
5078
case Intrinsic::nvvm_suld_3d_v2i64_zero:
5079
Info.opc = getOpcForSurfaceInstr(Intrinsic);
5080
Info.memVT = MVT::i64;
5081
Info.ptrVal = nullptr;
5082
Info.offset = 0;
5083
Info.flags = MachineMemOperand::MOLoad;
5084
Info.align = Align(16);
5085
return true;
5086
}
5087
return false;
5088
}

/// getFunctionParamOptimizedAlign - since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// because for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
    const Function *F, Type *ArgTy, const DataLayout &DL) const {
  // Capping the alignment to 128 bytes as that is the maximum alignment
  // supported by PTX.
  const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));

  // If a function has linkage different from internal or private, we
  // must use default ABI alignment as external users rely on it. Same
  // for a function that may be called from a function pointer.
  if (!F || !F->hasLocalLinkage() ||
      F->hasAddressTaken(/*Users=*/nullptr,
                         /*IgnoreCallbackUses=*/false,
                         /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
    return ABITypeAlign;

  assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
  return std::max(Align(16), ABITypeAlign);
}
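
// Illustrative note (not from the original source): raising the .param
// alignment to 16 is what enables a single 128-bit vector access for an
// aggregate argument of a local-linkage function. A hypothetical sketch:
//
//   struct Vec4 { float X, Y, Z, W; };   // ABI alignment is only 4
//   static float sum(Vec4 V);            // local linkage, address not taken
//
// With the parameter aligned to 16, the four fields can be fetched with one
// ld.param.v4.f32 instead of four scalar ld.param.f32 accesses.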

/// Helper for computing alignment of a device function byval parameter.
Align NVPTXTargetLowering::getFunctionByValParamAlign(
    const Function *F, Type *ArgTy, Align InitialAlign,
    const DataLayout &DL) const {
  Align ArgAlign = InitialAlign;
  // Try to increase alignment to enhance vectorization options.
  if (F)
    ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));

  // Old ptx versions have a bug. When PTX code takes address of
  // byval parameter with alignment < 4, ptxas generates code to
  // spill argument into memory. Alas on sm_50+ ptxas generates
  // SASS code that fails with misaligned access. To work around
  // the problem, make sure that we align byval parameters by at
  // least 4. This bug seems to be fixed at least starting from
  // ptxas > 9.0.
  // TODO: remove this after verifying the bug is not reproduced
  // on non-deprecated ptxas versions.
  if (ForceMinByValParamAlign)
    ArgAlign = std::max(ArgAlign, Align(4));

  return ArgAlign;
}

// Helper for getting a function parameter name. The name is composed from
// its index and the function name. A negative index corresponds to the
// special parameter (unsized array) used for passing variable arguments.
std::string NVPTXTargetLowering::getParamName(const Function *F,
                                              int Idx) const {
  std::string ParamName;
  raw_string_ostream ParamStr(ParamName);

  ParamStr << getTargetMachine().getSymbol(F)->getName();
  if (Idx < 0)
    ParamStr << "_vararg";
  else
    ParamStr << "_param_" << Idx;

  return ParamName;
}
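
// For example (illustrative, assuming a function whose symbol name is "foo"):
//   getParamName(F, 0)  returns "foo_param_0"
//   getParamName(F, 2)  returns "foo_param_2"
//   getParamName(F, -1) returns "foo_vararg"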

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS, Instruction *I) const {
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]

  // immoff must fit in a signed 32-bit int
  if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
    return false;

  if (AM.BaseGV)
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed
    return false;
  }
  return true;
}
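
// Illustrative examples (not exhaustive) of how the checks above classify
// candidate addressing modes:
//   base register + 16        (HasBaseReg, BaseOffs=16, Scale=0) -> legal
//   global symbol only        (BaseGV, no offset/register/scale) -> legal
//   global symbol + 8         (BaseGV with a nonzero BaseOffs)   -> rejected
//   base register + 4 * index (Scale=4)                          -> rejected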

//===----------------------------------------------------------------------===//
// NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case 'q':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'q': {
      if (STI.getSmVersion() < 70)
        report_fatal_error("Inline asm with 128 bit operands is only "
                           "supported for sm_70 and higher!");
      return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
    }
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
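
// For example (illustrative), for user-level inline assembly such as
//   asm("add.s32 %0, %1, %2;" : "=r"(Res) : "r"(A), "r"(B));
// each "r" constraint is mapped above to NVPTX::Int32RegsRegClass, while
// "h", "l", "f" and "d" select the 16-bit, 64-bit, f32 and f64 register
// classes, and "q" requires sm_70+ for 128-bit operands.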

//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOptLevel OptLevel) const {
  // Always honor command-line argument
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}
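
// Summary of the precedence implemented above (illustrative): an explicit
// -nvptx-fma-level=N on the command line always wins; otherwise contraction
// is off at -O0, on when fast FP-op fusion is requested (FPOpFusion::Fast,
// e.g. via -ffp-contract=fast), and finally falls back to the per-function
// "unsafe-fp-math" setting.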

static bool isConstZero(const SDValue &Operand) {
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 0;
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N0.getValueType();

  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  if (!N0.getNode()->hasOneUse())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL)
    return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);

  // fold (add (select cond, 0, (mul a, b)), c)
  //   -> (select cond, c, (mad a, b, c))
  //
  if (N0.getOpcode() == ISD::SELECT) {
    unsigned ZeroOpNum;
    if (isConstZero(N0->getOperand(1)))
      ZeroOpNum = 1;
    else if (isConstZero(N0->getOperand(2)))
      ZeroOpNum = 2;
    else
      return SDValue();

    SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
    if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
      return SDValue();

    SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                                  M->getOperand(0), M->getOperand(1), N1);
    return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
                             ((ZeroOpNum == 1) ? N1 : MAD),
                             ((ZeroOpNum == 1) ? MAD : N1));
  }

  return SDValue();
}

static SDValue
PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               CodeGenOptLevel OptLevel) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::FMUL) {
    const auto *TLI = static_cast<const NVPTXTargetLowering *>(
        &DCI.DAG.getTargetLoweringInfo());
    if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
      return SDValue();

    // For floating point:
    // Do the fusion only when the mul has fewer than 5 uses and all
    // of them are adds.
    // The heuristic is that if a use is not an add, then that use
    // cannot be fused into an fma, therefore the mul is still needed anyway.
    // If there are more than 4 uses, even if they are all adds, fusing
    // them will increase register pressure.
    //
    int numUses = 0;
    int nonAddCount = 0;
    for (const SDNode *User : N0.getNode()->uses()) {
      numUses++;
      if (User->getOpcode() != ISD::FADD)
        ++nonAddCount;
      if (numUses >= 5)
        return SDValue();
    }
    if (nonAddCount) {
      int orderNo = N->getIROrder();
      int orderNo2 = N0.getNode()->getIROrder();
      // Simple heuristic for estimating potential register pressure:
      // the IR-order difference is used to measure the distance between
      // def and use; the longer the distance, the more likely it is to
      // cause register pressure.
      if (orderNo - orderNo2 < 500)
        return SDValue();

      // Now, check if at least one of the FMUL's operands is live beyond the
      // node N, which guarantees that the FMA will not increase register
      // pressure at node N.
      bool opIsLive = false;
      const SDNode *left = N0.getOperand(0).getNode();
      const SDNode *right = N0.getOperand(1).getNode();

      if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
        opIsLive = true;

      if (!opIsLive)
        for (const SDNode *User : left->uses()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        for (const SDNode *User : right->uses()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        return SDValue();
    }

    return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);
  }

  return SDValue();
}
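
// Worked example of the heuristic above (illustrative): if an FMUL feeds
// three FADDs and one FDIV, the FDIV means the multiply survives anyway, so
// the FMA is formed only when the FMUL was defined at least 500 IR-order
// positions before this FADD and at least one FMUL operand (or a constant
// operand) remains live past it; otherwise fusing would only add register
// pressure without eliminating the multiply.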

static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
                                         std::size_t Back) {
  if (all_of(N->ops().drop_front(Front).drop_back(Back),
             [](const SDUse &U) { return U.get()->isUndef(); }))
    // Operand 0 is the previous value in the chain. Cannot return EntryToken
    // as the previous value will become unused and eliminated later.
    return N->getOperand(0);

  return SDValue();
}

static SDValue PerformStoreParamCombine(SDNode *N) {
  // Operands from the 3rd to the 2nd last one are the values to be stored.
  //   {Chain, ArgID, Offset, Val, Glue}
  return PerformStoreCombineHelper(N, 3, 1);
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored
  return PerformStoreCombineHelper(N, 2, 0);
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector() || VT != MVT::i32)
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI);
}

/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
///
static SDValue PerformFADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  EVT VT = N0.getValueType();
  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;

  // Convert BFE-> truncate i16 -> and 255
  // To just BFE-> truncate i16, as the value already has all the bits in the
  // right places.
  if (Val.getOpcode() == ISD::TRUNCATE) {
    SDValue BFE = Val.getOperand(0);
    if (BFE.getOpcode() != NVPTXISD::BFE)
      return SDValue();

    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
    if (!BFEBits)
      return SDValue();
    uint64_t BFEBitsVal = BFEBits->getZExtValue();

    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }
    uint64_t MaskVal = MaskCnst->getZExtValue();

    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
      return SDValue();
    // If we get here, the AND is unnecessary. Just replace it with the trunc
    DCI.CombineTo(N, Val, false);
  }
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
    Val = Val->getOperand(0);
  }

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off top 8 bits
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
      return SDValue();
    }

    unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to zero
      // out the high 8 bits
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}
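
// For example (illustrative): a legalized v4i8 load can appear as
//   (and (any_extend (NVPTXISD::LoadV4 ..., zextload)), 255)
// Because the zextload already cleared the high bits, the AND is redundant;
// the code above drops it and re-inserts the extension as a ZERO_EXTEND of
// the load result.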

static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOptLevel::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *U : Num->uses()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}
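
// For example (illustrative): if a function computes both Q = A / B and
// R = A % B, the remainder is rewritten as A - (A / B) * B so the existing
// division node is reused (via DAG CSE) and only one divide remains.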
5613
5614
enum OperandSignedness {
5615
Signed = 0,
5616
Unsigned,
5617
Unknown
5618
};
5619
5620
/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5621
/// that can be demoted to \p OptSize bits without loss of information. The
5622
/// signedness of the operand, if determinable, is placed in \p S.
5623
static bool IsMulWideOperandDemotable(SDValue Op,
5624
unsigned OptSize,
5625
OperandSignedness &S) {
5626
S = Unknown;
5627
5628
if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5629
Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5630
EVT OrigVT = Op.getOperand(0).getValueType();
5631
if (OrigVT.getFixedSizeInBits() <= OptSize) {
5632
S = Signed;
5633
return true;
5634
}
5635
} else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5636
EVT OrigVT = Op.getOperand(0).getValueType();
5637
if (OrigVT.getFixedSizeInBits() <= OptSize) {
5638
S = Unsigned;
5639
return true;
5640
}
5641
}
5642
5643
return false;
5644
}
5645
5646
/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5647
/// be demoted to \p OptSize bits without loss of information. If the operands
5648
/// contain a constant, it should appear as the RHS operand. The signedness of
5649
/// the operands is placed in \p IsSigned.
5650
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5651
unsigned OptSize,
5652
bool &IsSigned) {
5653
OperandSignedness LHSSign;
5654
5655
// The LHS operand must be a demotable op
5656
if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5657
return false;
5658
5659
// We should have been able to determine the signedness from the LHS
5660
if (LHSSign == Unknown)
5661
return false;
5662
5663
IsSigned = (LHSSign == Signed);
5664
5665
// The RHS can be a demotable op or a constant
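// For a 32-bit multiply this means the constant must fit in 16 bits, with
// the signedness interpretation taken from the LHS.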
5666
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5667
const APInt &Val = CI->getAPIntValue();
5668
if (LHSSign == Unsigned) {
5669
return Val.isIntN(OptSize);
5670
} else {
5671
return Val.isSignedIntN(OptSize);
5672
}
5673
} else {
5674
OperandSignedness RHSSign;
5675
if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5676
return false;
5677
5678
return LHSSign == RHSSign;
5679
}
5680
}
5681
5682
/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5683
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5684
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5685
/// amount.
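/// For example, a 32-bit multiply whose operands are both sign-extended from
/// i16, such as (mul (sext i16 a), (sext i16 b)), can be demoted and selected
/// as the PTX mul.wide.s16 instruction.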
5686
static SDValue TryMULWIDECombine(SDNode *N,
5687
TargetLowering::DAGCombinerInfo &DCI) {
5688
EVT MulType = N->getValueType(0);
5689
if (MulType != MVT::i32 && MulType != MVT::i64) {
5690
return SDValue();
5691
}
5692
5693
SDLoc DL(N);
5694
unsigned OptSize = MulType.getSizeInBits() >> 1;
5695
SDValue LHS = N->getOperand(0);
5696
SDValue RHS = N->getOperand(1);
5697
5698
// Canonicalize the multiply so the constant (if any) is on the right
5699
if (N->getOpcode() == ISD::MUL) {
5700
if (isa<ConstantSDNode>(LHS)) {
5701
std::swap(LHS, RHS);
5702
}
5703
}
5704
5705
// If we have a SHL, determine the actual multiply amount
5706
if (N->getOpcode() == ISD::SHL) {
5707
ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5708
if (!ShlRHS) {
5709
return SDValue();
5710
}
5711
5712
APInt ShiftAmt = ShlRHS->getAPIntValue();
5713
unsigned BitWidth = MulType.getSizeInBits();
5714
if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5715
APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5716
RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5717
} else {
5718
return SDValue();
5719
}
5720
}
5721
5722
bool Signed;
5723
// Verify that our operands are demotable
5724
if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5725
return SDValue();
5726
}
5727
5728
EVT DemotedVT;
5729
if (MulType == MVT::i32) {
5730
DemotedVT = MVT::i16;
5731
} else {
5732
DemotedVT = MVT::i32;
5733
}
5734
5735
// Truncate the operands to the correct size. Note that these are just for
5736
// type consistency and will (likely) be eliminated in later phases.
5737
SDValue TruncLHS =
5738
DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5739
SDValue TruncRHS =
5740
DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5741
5742
unsigned Opc;
5743
if (Signed) {
5744
Opc = NVPTXISD::MUL_WIDE_SIGNED;
5745
} else {
5746
Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5747
}
5748
5749
return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5750
}
5751
5752
static bool isConstOne(const SDValue &Operand) {
5753
const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5754
return Const && Const->getZExtValue() == 1;
5755
}
5756
5757
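/// Matches (add y, 1) or (add 1, y) and returns y, so callers can fold
/// (mul x, (add y, 1)) into (mad x, y, x).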
static SDValue matchMADConstOnePattern(SDValue Add) {
5758
if (Add->getOpcode() != ISD::ADD)
5759
return SDValue();
5760
5761
if (isConstOne(Add->getOperand(0)))
5762
return Add->getOperand(1);
5763
5764
if (isConstOne(Add->getOperand(1)))
5765
return Add->getOperand(0);
5766
5767
return SDValue();
5768
}
5769
5770
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5771
TargetLowering::DAGCombinerInfo &DCI) {
5772
5773
if (SDValue Y = matchMADConstOnePattern(Add))
5774
return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
5775
5776
return SDValue();
5777
}
5778
5779
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5780
SDLoc DL,
5781
TargetLowering::DAGCombinerInfo &DCI) {
5782
if (Select->getOpcode() != ISD::SELECT)
5783
return SDValue();
5784
5785
SDValue Cond = Select->getOperand(0);
5786
5787
unsigned ConstOpNo;
5788
if (isConstOne(Select->getOperand(1)))
5789
ConstOpNo = 1;
5790
else if (isConstOne(Select->getOperand(2)))
5791
ConstOpNo = 2;
5792
else
5793
return SDValue();
5794
5795
SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5796
5797
// Do not combine if the resulting sequence is not obviously profitable.
5798
if (!matchMADConstOnePattern(Y))
5799
return SDValue();
5800
5801
SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5802
5803
return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5804
(ConstOpNo == 1) ? X : NewMul,
5805
(ConstOpNo == 1) ? NewMul : X);
5806
}
5807
5808
static SDValue
5809
PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5810
TargetLowering::DAGCombinerInfo &DCI) {
5811
5812
EVT VT = N0.getValueType();
5813
if (VT.isVector())
5814
return SDValue();
5815
5816
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5817
return SDValue();
5818
5819
SDLoc DL(N);
5820
5821
// (mul x, (add y, 1)) -> (mad x, y, x)
5822
if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5823
return Res;
5824
if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5825
return Res;
5826
5827
// (mul x, (select cond, 1, y)) -> (select cond, x, (mul x, y))
5828
if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5829
return Res;
5830
if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5831
return Res;
5832
5833
return SDValue();
5834
}
5835
5836
/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5837
static SDValue PerformMULCombine(SDNode *N,
5838
TargetLowering::DAGCombinerInfo &DCI,
5839
CodeGenOptLevel OptLevel) {
5840
if (OptLevel == CodeGenOptLevel::None)
5841
return SDValue();
5842
5843
if (SDValue Ret = TryMULWIDECombine(N, DCI))
5844
return Ret;
5845
5846
SDValue N0 = N->getOperand(0);
5847
SDValue N1 = N->getOperand(1);
5848
return PerformMULCombineWithOperands(N, N0, N1, DCI);
5849
}
5850
5851
/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5852
static SDValue PerformSHLCombine(SDNode *N,
5853
TargetLowering::DAGCombinerInfo &DCI,
5854
CodeGenOptLevel OptLevel) {
5855
if (OptLevel > CodeGenOptLevel::None) {
5856
// Try mul.wide combining at OptLevel > 0
5857
if (SDValue Ret = TryMULWIDECombine(N, DCI))
5858
return Ret;
5859
}
5860
5861
return SDValue();
5862
}
5863
5864
static SDValue PerformSETCCCombine(SDNode *N,
5865
TargetLowering::DAGCombinerInfo &DCI,
5866
unsigned int SmVersion) {
5867
EVT CCType = N->getValueType(0);
5868
SDValue A = N->getOperand(0);
5869
SDValue B = N->getOperand(1);
5870
5871
EVT AType = A.getValueType();
5872
if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5873
return SDValue();
5874
5875
if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5876
return SDValue();
5877
5878
SDLoc DL(N);
5879
// setp.f16x2 returns two scalar predicates, which we need to
5880
// convert back to v2i1. The returned result will be scalarized by
5881
// the legalizer, but the comparison will remain a single vector
5882
// instruction.
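// That is, a setcc with v2f16/v2bf16 operands becomes a single
// SETP_F16X2/SETP_BF16X2 node producing two i1 values, which are packed back
// into a v2i1 with a build_vector.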
5883
SDValue CCNode = DCI.DAG.getNode(
5884
A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5885
: NVPTXISD::SETP_BF16X2,
5886
DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5887
return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5888
CCNode.getValue(1));
5889
}
5890
5891
static SDValue PerformEXTRACTCombine(SDNode *N,
5892
TargetLowering::DAGCombinerInfo &DCI) {
5893
SDValue Vector = N->getOperand(0);
5894
SDLoc DL(N);
5895
EVT VectorVT = Vector.getValueType();
5896
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5897
IsPTXVectorType(VectorVT.getSimpleVT()))
5898
return SDValue(); // Native vector loads already combine nicely w/
5899
// extract_vector_elt.
5900
// Don't mess with singletons or v2*16, v4i8 and v8i8 types; we already
// handle them OK.
5902
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5903
VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5904
return SDValue();
5905
5906
// Don't mess with undef values as sra may be simplified to 0, not undef.
5907
if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5908
return SDValue();
5909
5910
uint64_t VectorBits = VectorVT.getSizeInBits();
5911
// We only handle the types we can extract in-register.
5912
if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5913
return SDValue();
5914
5915
ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5916
// Index == 0 is handled by generic DAG combiner.
5917
if (!Index || Index->getZExtValue() == 0)
5918
return SDValue();
5919
5920
MVT IVT = MVT::getIntegerVT(VectorBits);
5921
EVT EltVT = VectorVT.getVectorElementType();
5922
EVT EltIVT = EltVT.changeTypeToInteger();
5923
uint64_t EltBits = EltVT.getScalarSizeInBits();
5924
5925
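// Perform the extract by shifting the vector, reinterpreted as a single
// integer, right by Index * EltBits and truncating to the element width;
// e.g. element 1 of a v4i8 becomes (trunc (sra (bitcast X to i32), 8)).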
SDValue Result = DCI.DAG.getNode(
5926
ISD::TRUNCATE, DL, EltIVT,
5927
DCI.DAG.getNode(
5928
ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5929
DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5930
5931
// If element has non-integer type, bitcast it back to the expected type.
5932
if (EltVT != EltIVT)
5933
Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5934
// Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5935
if (EltVT != N->getValueType(0))
5936
Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5937
5938
return Result;
5939
}
5940
5941
static SDValue PerformVSELECTCombine(SDNode *N,
5942
TargetLowering::DAGCombinerInfo &DCI) {
5943
SDValue VA = N->getOperand(1);
5944
EVT VectorVT = VA.getValueType();
5945
if (VectorVT != MVT::v4i8)
5946
return SDValue();
5947
5948
// We need to split vselect into individual per-element operations. Because we
// use BFE/BFI instructions for byte extraction/insertion, we end up with
// 32-bit values, so we may as well do the comparison as i32 to avoid the
// conversions to/from i16 normally used for i8 values.
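// For example, (vselect v4i1 C, v4i8 A, v4i8 B) becomes a build_vector of
// four scalar selects, each computed in i32 and truncated back to i8.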
5952
SmallVector<SDValue, 4> E;
5953
SDLoc DL(N);
5954
SDValue VCond = N->getOperand(0);
5955
SDValue VB = N->getOperand(2);
5956
for (int I = 0; I < 4; ++I) {
5957
SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5958
DCI.DAG.getConstant(I, DL, MVT::i32));
5959
SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5960
DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5961
DCI.DAG.getConstant(I, DL, MVT::i32)),
5962
DL, MVT::i32);
5963
SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5964
DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5965
DCI.DAG.getConstant(I, DL, MVT::i32)),
5966
DL, MVT::i32);
5967
E.push_back(DCI.DAG.getAnyExtOrTrunc(
5968
DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5969
}
5970
return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5971
}
5972
5973
static SDValue PerformLOADCombine(SDNode *N,
5974
TargetLowering::DAGCombinerInfo &DCI) {
5975
SelectionDAG &DAG = DCI.DAG;
5976
LoadSDNode *LD = cast<LoadSDNode>(N);
5977
5978
// Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5979
// letting ReplaceLoadVector split it into smaller loads during legalization.
5980
// This is done at dag-combine1 time, so that vector operations with i8
5981
// elements can be optimised away instead of being needlessly split during
5982
// legalization, which involves storing to the stack and loading it back.
5983
EVT VT = N->getValueType(0);
5984
if (VT != MVT::v16i8)
5985
return SDValue();
5986
5987
SDLoc DL(N);
5988
5989
// Create a v4i32 vector load operation, effectively <4 x v4i8>.
5990
unsigned Opc = NVPTXISD::LoadV4;
5991
EVT NewVT = MVT::v4i32;
5992
EVT EltVT = NewVT.getVectorElementType();
5993
unsigned NumElts = NewVT.getVectorNumElements();
5994
EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5995
SDVTList RetVTList = DAG.getVTList(RetVTs);
5996
SmallVector<SDValue, 8> Ops(N->ops());
5997
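// Instruction selection cannot see the original LoadSDNode, so pass the
// extension type along as an extra operand.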
Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5998
SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5999
LD->getMemOperand());
6000
SDValue NewChain = NewLoad.getValue(NumElts);
6001
6002
// Create a vector of the same type returned by the original load.
6003
SmallVector<SDValue, 4> Elts;
6004
for (unsigned i = 0; i < NumElts; i++)
6005
Elts.push_back(NewLoad.getValue(i));
6006
return DCI.DAG.getMergeValues(
6007
{DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
6008
NewChain},
6009
DL);
6010
}
6011
6012
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6013
DAGCombinerInfo &DCI) const {
6014
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6015
switch (N->getOpcode()) {
6016
default: break;
6017
case ISD::ADD:
6018
return PerformADDCombine(N, DCI, OptLevel);
6019
case ISD::FADD:
6020
return PerformFADDCombine(N, DCI, OptLevel);
6021
case ISD::MUL:
6022
return PerformMULCombine(N, DCI, OptLevel);
6023
case ISD::SHL:
6024
return PerformSHLCombine(N, DCI, OptLevel);
6025
case ISD::AND:
6026
return PerformANDCombine(N, DCI);
6027
case ISD::UREM:
6028
case ISD::SREM:
6029
return PerformREMCombine(N, DCI, OptLevel);
6030
case ISD::SETCC:
6031
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6032
case ISD::LOAD:
6033
return PerformLOADCombine(N, DCI);
6034
case NVPTXISD::StoreRetval:
6035
case NVPTXISD::StoreRetvalV2:
6036
case NVPTXISD::StoreRetvalV4:
6037
return PerformStoreRetvalCombine(N);
6038
case NVPTXISD::StoreParam:
6039
case NVPTXISD::StoreParamV2:
6040
case NVPTXISD::StoreParamV4:
6041
return PerformStoreParamCombine(N);
6042
case ISD::EXTRACT_VECTOR_ELT:
6043
return PerformEXTRACTCombine(N, DCI);
6044
case ISD::VSELECT:
6045
return PerformVSELECTCombine(N, DCI);
6046
}
6047
return SDValue();
6048
}
6049
6050
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
6051
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
6052
SmallVectorImpl<SDValue> &Results) {
6053
EVT ResVT = N->getValueType(0);
6054
SDLoc DL(N);
6055
6056
assert(ResVT.isVector() && "Vector load must have vector type");
6057
6058
// We only handle "native" vector sizes for now, e.g. <4 x double> is not
6059
// legal. We can (and should) split that into 2 loads of <2 x double> here
6060
// but I'm leaving that as a TODO for now.
6061
assert(ResVT.isSimple() && "Can only handle simple types");
6062
switch (ResVT.getSimpleVT().SimpleTy) {
6063
default:
6064
return;
6065
case MVT::v2i8:
6066
case MVT::v2i16:
6067
case MVT::v2i32:
6068
case MVT::v2i64:
6069
case MVT::v2f16:
6070
case MVT::v2f32:
6071
case MVT::v2f64:
6072
case MVT::v4i8:
6073
case MVT::v4i16:
6074
case MVT::v4i32:
6075
case MVT::v4f16:
6076
case MVT::v4f32:
6077
case MVT::v8f16: // <4 x f16x2>
6078
case MVT::v8bf16: // <4 x bf16x2>
6079
case MVT::v8i16: // <4 x i16x2>
6080
// This is a "native" vector type
6081
break;
6082
}
6083
6084
LoadSDNode *LD = cast<LoadSDNode>(N);
6085
6086
Align Alignment = LD->getAlign();
6087
auto &TD = DAG.getDataLayout();
6088
Align PrefAlign =
6089
TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
6090
if (Alignment < PrefAlign) {
6091
// This load is not sufficiently aligned, so bail out and let this vector
6092
// load be scalarized. Note that we may still be able to emit smaller
6093
// vector loads. For example, if we are loading a <4 x float> with an
6094
// alignment of 8, this check will fail but the legalizer will try again
6095
// with 2 x <2 x float>, which will succeed with an alignment of 8.
6096
return;
6097
}
6098
6099
EVT EltVT = ResVT.getVectorElementType();
6100
unsigned NumElts = ResVT.getVectorNumElements();
6101
6102
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
6103
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
6104
// loaded type to i16 and propagate the "real" type as the memory type.
6105
bool NeedTrunc = false;
6106
if (EltVT.getSizeInBits() < 16) {
6107
EltVT = MVT::i16;
6108
NeedTrunc = true;
6109
}
6110
6111
unsigned Opcode = 0;
6112
SDVTList LdResVTs;
6113
bool Load16x2 = false;
6114
6115
switch (NumElts) {
6116
default:
6117
return;
6118
case 2:
6119
Opcode = NVPTXISD::LoadV2;
6120
LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6121
break;
6122
case 4: {
6123
Opcode = NVPTXISD::LoadV4;
6124
EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6125
LdResVTs = DAG.getVTList(ListVTs);
6126
break;
6127
}
6128
case 8: {
6129
// v8 vectors of 16-bit elements (v8f16, v8bf16, v8i16) are a special case.
// PTX doesn't have a ld.v8 instruction for them. Instead, we split the vector
// into two-element chunks and load them with ld.v4.b32.
6132
assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
6133
Load16x2 = true;
6134
Opcode = NVPTXISD::LoadV4;
6135
EVT VVT;
6136
switch (EltVT.getSimpleVT().SimpleTy) {
6137
case MVT::f16:
6138
VVT = MVT::v2f16;
6139
break;
6140
case MVT::bf16:
6141
VVT = MVT::v2bf16;
6142
break;
6143
case MVT::i16:
6144
VVT = MVT::v2i16;
6145
break;
6146
default:
6147
llvm_unreachable("Unsupported v8 vector type.");
6148
}
6149
EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
6150
LdResVTs = DAG.getVTList(ListVTs);
6151
break;
6152
}
6153
}
6154
6155
// Copy regular operands
6156
SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
6157
6158
// The select routine does not have access to the LoadSDNode instance, so
6159
// pass along the extension information
6160
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
6161
6162
SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6163
LD->getMemoryVT(),
6164
LD->getMemOperand());
6165
6166
SmallVector<SDValue, 8> ScalarRes;
6167
if (Load16x2) {
6168
// Split the v2x16 subvectors back into individual elements.
6169
NumElts /= 2;
6170
for (unsigned i = 0; i < NumElts; ++i) {
6171
SDValue SubVector = NewLD.getValue(i);
6172
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6173
DAG.getIntPtrConstant(0, DL));
6174
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6175
DAG.getIntPtrConstant(1, DL));
6176
ScalarRes.push_back(E0);
6177
ScalarRes.push_back(E1);
6178
}
6179
} else {
6180
for (unsigned i = 0; i < NumElts; ++i) {
6181
SDValue Res = NewLD.getValue(i);
6182
if (NeedTrunc)
6183
Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6184
ScalarRes.push_back(Res);
6185
}
6186
}
6187
6188
SDValue LoadChain = NewLD.getValue(NumElts);
6189
6190
SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
6191
6192
Results.push_back(BuildVec);
6193
Results.push_back(LoadChain);
6194
}
6195
6196
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6197
SmallVectorImpl<SDValue> &Results) {
6198
SDValue Chain = N->getOperand(0);
6199
SDValue Intrin = N->getOperand(1);
6200
SDLoc DL(N);
6201
6202
// Get the intrinsic ID
6203
unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6204
switch (IntrinNo) {
6205
default:
6206
return;
6207
case Intrinsic::nvvm_ldg_global_i:
6208
case Intrinsic::nvvm_ldg_global_f:
6209
case Intrinsic::nvvm_ldg_global_p:
6210
case Intrinsic::nvvm_ldu_global_i:
6211
case Intrinsic::nvvm_ldu_global_f:
6212
case Intrinsic::nvvm_ldu_global_p: {
6213
EVT ResVT = N->getValueType(0);
6214
6215
if (ResVT.isVector()) {
6216
// Vector LDG/LDU
6217
6218
unsigned NumElts = ResVT.getVectorNumElements();
6219
EVT EltVT = ResVT.getVectorElementType();
6220
6221
// Since LDU/LDG are target nodes, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
// loaded type to i16 and propagate the "real" type as the memory type.
6225
bool NeedTrunc = false;
6226
if (EltVT.getSizeInBits() < 16) {
6227
EltVT = MVT::i16;
6228
NeedTrunc = true;
6229
}
6230
6231
unsigned Opcode = 0;
6232
SDVTList LdResVTs;
6233
6234
switch (NumElts) {
6235
default:
6236
return;
6237
case 2:
6238
switch (IntrinNo) {
6239
default:
6240
return;
6241
case Intrinsic::nvvm_ldg_global_i:
6242
case Intrinsic::nvvm_ldg_global_f:
6243
case Intrinsic::nvvm_ldg_global_p:
6244
Opcode = NVPTXISD::LDGV2;
6245
break;
6246
case Intrinsic::nvvm_ldu_global_i:
6247
case Intrinsic::nvvm_ldu_global_f:
6248
case Intrinsic::nvvm_ldu_global_p:
6249
Opcode = NVPTXISD::LDUV2;
6250
break;
6251
}
6252
LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6253
break;
6254
case 4: {
6255
switch (IntrinNo) {
6256
default:
6257
return;
6258
case Intrinsic::nvvm_ldg_global_i:
6259
case Intrinsic::nvvm_ldg_global_f:
6260
case Intrinsic::nvvm_ldg_global_p:
6261
Opcode = NVPTXISD::LDGV4;
6262
break;
6263
case Intrinsic::nvvm_ldu_global_i:
6264
case Intrinsic::nvvm_ldu_global_f:
6265
case Intrinsic::nvvm_ldu_global_p:
6266
Opcode = NVPTXISD::LDUV4;
6267
break;
6268
}
6269
EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6270
LdResVTs = DAG.getVTList(ListVTs);
6271
break;
6272
}
6273
}
6274
6275
SmallVector<SDValue, 8> OtherOps;

// Copy the regular operands: the chain, then everything after operand 1
// (the intrinsic ID).
OtherOps.push_back(Chain);
OtherOps.append(N->op_begin() + 2, N->op_end());
6283
6284
MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6285
6286
SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6287
MemSD->getMemoryVT(),
6288
MemSD->getMemOperand());
6289
6290
SmallVector<SDValue, 4> ScalarRes;
6291
6292
for (unsigned i = 0; i < NumElts; ++i) {
6293
SDValue Res = NewLD.getValue(i);
6294
if (NeedTrunc)
6295
Res =
6296
DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6297
ScalarRes.push_back(Res);
6298
}
6299
6300
SDValue LoadChain = NewLD.getValue(NumElts);
6301
6302
SDValue BuildVec =
6303
DAG.getBuildVector(ResVT, DL, ScalarRes);
6304
6305
Results.push_back(BuildVec);
6306
Results.push_back(LoadChain);
6307
} else {
6308
// i8 LDG/LDU
6309
assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6310
"Custom handling of non-i8 ldu/ldg?");
6311
6312
// Just copy all operands as-is
6313
SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
6314
6315
// Force output to i16
6316
SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6317
6318
MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6319
6320
// We make sure the memory type is i8, which will be used during isel
6321
// to select the proper instruction.
6322
SDValue NewLD =
6323
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6324
MVT::i8, MemSD->getMemOperand());
6325
6326
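// The load was widened to i16, so truncate the result back to the i8 the
// intrinsic is declared to return.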
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6327
NewLD.getValue(0)));
6328
Results.push_back(NewLD.getValue(1));
6329
}
6330
}
6331
}
6332
}
6333
6334
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6335
SmallVectorImpl<SDValue> &Results) {
6336
// Change the CopyFromReg to output two 64-bit results instead of a single
// 128-bit result so that it can be legalized.
6338
SDLoc DL(N);
6339
SDValue Chain = N->getOperand(0);
6340
SDValue Reg = N->getOperand(1);
6341
SDValue Glue = N->getOperand(2);
6342
6343
assert(Reg.getValueType() == MVT::i128 &&
6344
"Custom lowering for CopyFromReg with 128-bit reg only");
6345
SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6346
N->getValueType(2)};
6347
SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6348
6349
SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6350
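// Reassemble the two i64 halves into the i128 value the original
// CopyFromReg produced.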
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6351
{NewValue.getValue(0), NewValue.getValue(1)});
6352
6353
Results.push_back(Pair);
6354
Results.push_back(NewValue.getValue(2));
6355
Results.push_back(NewValue.getValue(3));
6356
}
6357
6358
void NVPTXTargetLowering::ReplaceNodeResults(
6359
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6360
switch (N->getOpcode()) {
6361
default:
6362
report_fatal_error("Unhandled custom legalization");
6363
case ISD::LOAD:
6364
ReplaceLoadVector(N, DAG, Results);
6365
return;
6366
case ISD::INTRINSIC_W_CHAIN:
6367
ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6368
return;
6369
case ISD::CopyFromReg:
6370
ReplaceCopyFromReg_128(N, DAG, Results);
6371
return;
6372
}
6373
}
6374
6375
NVPTXTargetLowering::AtomicExpansionKind
6376
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6377
Type *Ty = AI->getValOperand()->getType();
6378
6379
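// Anything that cannot be mapped to a native atomic instruction below is
// expanded by AtomicExpandPass into a compare-and-swap loop
// (AtomicExpansionKind::CmpXChg).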
if (AI->isFloatingPointOperation()) {
6380
if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6381
if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6382
STI.getPTXVersion() >= 63)
6383
return AtomicExpansionKind::None;
6384
if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6385
STI.getPTXVersion() >= 78)
6386
return AtomicExpansionKind::None;
6387
if (Ty->isFloatTy())
6388
return AtomicExpansionKind::None;
6389
if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6390
return AtomicExpansionKind::None;
6391
}
6392
return AtomicExpansionKind::CmpXChg;
6393
}
6394
6395
assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6396
auto ITy = cast<llvm::IntegerType>(Ty);
6397
6398
switch (AI->getOperation()) {
6399
default:
6400
return AtomicExpansionKind::CmpXChg;
6401
case AtomicRMWInst::BinOp::And:
6402
case AtomicRMWInst::BinOp::Or:
6403
case AtomicRMWInst::BinOp::Xor:
6404
case AtomicRMWInst::BinOp::Xchg:
6405
switch (ITy->getBitWidth()) {
6406
case 8:
6407
case 16:
6408
return AtomicExpansionKind::CmpXChg;
6409
case 32:
6410
return AtomicExpansionKind::None;
6411
case 64:
6412
if (STI.hasAtomBitwise64())
6413
return AtomicExpansionKind::None;
6414
return AtomicExpansionKind::CmpXChg;
6415
default:
6416
llvm_unreachable("unsupported width encountered");
6417
}
6418
case AtomicRMWInst::BinOp::Add:
6419
case AtomicRMWInst::BinOp::Sub:
6420
case AtomicRMWInst::BinOp::Max:
6421
case AtomicRMWInst::BinOp::Min:
6422
case AtomicRMWInst::BinOp::UMax:
6423
case AtomicRMWInst::BinOp::UMin:
6424
switch (ITy->getBitWidth()) {
6425
case 8:
6426
case 16:
6427
return AtomicExpansionKind::CmpXChg;
6428
case 32:
6429
return AtomicExpansionKind::None;
6430
case 64:
6431
if (STI.hasAtomMinMax64())
6432
return AtomicExpansionKind::None;
6433
return AtomicExpansionKind::CmpXChg;
6434
default:
6435
llvm_unreachable("unsupported width encountered");
6436
}
6437
}
6438
6439
return AtomicExpansionKind::CmpXChg;
6440
}
6441
6442
// Pin NVPTXTargetObjectFile's vtables to this file.
6443
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6444
6445
MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6446
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6447
return getDataSection();
6448
}
6449
6450