GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
1
//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//==-----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// Defines an instruction selector for the AMDGPU target.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "AMDGPUISelDAGToDAG.h"
15
#include "AMDGPU.h"
16
#include "AMDGPUInstrInfo.h"
17
#include "AMDGPUSubtarget.h"
18
#include "AMDGPUTargetMachine.h"
19
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20
#include "MCTargetDesc/R600MCTargetDesc.h"
21
#include "R600RegisterInfo.h"
22
#include "SIISelLowering.h"
23
#include "SIMachineFunctionInfo.h"
24
#include "llvm/Analysis/UniformityAnalysis.h"
25
#include "llvm/Analysis/ValueTracking.h"
26
#include "llvm/CodeGen/FunctionLoweringInfo.h"
27
#include "llvm/CodeGen/SelectionDAG.h"
28
#include "llvm/CodeGen/SelectionDAGISel.h"
29
#include "llvm/CodeGen/SelectionDAGNodes.h"
30
#include "llvm/IR/IntrinsicsAMDGPU.h"
31
#include "llvm/InitializePasses.h"
32
#include "llvm/Support/ErrorHandling.h"
33
34
#ifdef EXPENSIVE_CHECKS
35
#include "llvm/Analysis/LoopInfo.h"
36
#include "llvm/IR/Dominators.h"
37
#endif
38
39
#define DEBUG_TYPE "amdgpu-isel"
40
41
using namespace llvm;
42
43
//===----------------------------------------------------------------------===//
44
// Instruction Selector Implementation
45
//===----------------------------------------------------------------------===//
46
47
namespace {
48
static SDValue stripBitcast(SDValue Val) {
49
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
50
}
51
52
// Figure out if this is really an extract of the high 16-bits of a dword.
53
static bool isExtractHiElt(SDValue In, SDValue &Out) {
54
In = stripBitcast(In);
55
56
if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57
if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
58
if (!Idx->isOne())
59
return false;
60
Out = In.getOperand(0);
61
return true;
62
}
63
}
64
65
if (In.getOpcode() != ISD::TRUNCATE)
66
return false;
67
68
SDValue Srl = In.getOperand(0);
69
if (Srl.getOpcode() == ISD::SRL) {
70
if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
71
if (ShiftAmt->getZExtValue() == 16) {
72
Out = stripBitcast(Srl.getOperand(0));
73
return true;
74
}
75
}
76
}
77
78
return false;
79
}
80
81
// Look through operations that obscure just looking at the low 16-bits of the
82
// same register.
83
static SDValue stripExtractLoElt(SDValue In) {
84
if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85
SDValue Idx = In.getOperand(1);
86
if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
87
return In.getOperand(0);
88
}
89
90
if (In.getOpcode() == ISD::TRUNCATE) {
91
SDValue Src = In.getOperand(0);
92
if (Src.getValueType().getSizeInBits() == 32)
93
return stripBitcast(Src);
94
}
95
96
return In;
97
}
98
99
} // end anonymous namespace
100
101
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
102
"AMDGPU DAG->DAG Pattern Instruction Selection", false,
103
false)
104
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
105
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
106
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
107
#ifdef EXPENSIVE_CHECKS
108
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
109
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
110
#endif
111
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
112
"AMDGPU DAG->DAG Pattern Instruction Selection", false,
113
false)
114
115
/// This pass converts a legalized DAG into an AMDGPU-specific
116
/// DAG, ready for instruction scheduling.
117
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
118
CodeGenOptLevel OptLevel) {
119
return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
120
}
121
122
AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
123
CodeGenOptLevel OptLevel)
124
: SelectionDAGISel(TM, OptLevel) {
125
EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
126
}
127
128
bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
129
Subtarget = &MF.getSubtarget<GCNSubtarget>();
130
Subtarget->checkSubtargetFeatures(MF.getFunction());
131
Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
132
return SelectionDAGISel::runOnMachineFunction(MF);
133
}
134
135
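// Returns true if an f16 result produced by \p Opc is known to leave the high
// 16 bits of the containing 32-bit register zeroed on this subtarget.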
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
136
// XXX - only need to list legal operations.
137
switch (Opc) {
138
case ISD::FADD:
139
case ISD::FSUB:
140
case ISD::FMUL:
141
case ISD::FDIV:
142
case ISD::FREM:
143
case ISD::FCANONICALIZE:
144
case ISD::UINT_TO_FP:
145
case ISD::SINT_TO_FP:
146
case ISD::FABS:
147
// Fabs is lowered to a bit operation, but it's an and which will clear the
148
// high bits anyway.
149
case ISD::FSQRT:
150
case ISD::FSIN:
151
case ISD::FCOS:
152
case ISD::FPOWI:
153
case ISD::FPOW:
154
case ISD::FLOG:
155
case ISD::FLOG2:
156
case ISD::FLOG10:
157
case ISD::FEXP:
158
case ISD::FEXP2:
159
case ISD::FCEIL:
160
case ISD::FTRUNC:
161
case ISD::FRINT:
162
case ISD::FNEARBYINT:
163
case ISD::FROUNDEVEN:
164
case ISD::FROUND:
165
case ISD::FFLOOR:
166
case ISD::FMINNUM:
167
case ISD::FMAXNUM:
168
case ISD::FLDEXP:
169
case AMDGPUISD::FRACT:
170
case AMDGPUISD::CLAMP:
171
case AMDGPUISD::COS_HW:
172
case AMDGPUISD::SIN_HW:
173
case AMDGPUISD::FMIN3:
174
case AMDGPUISD::FMAX3:
175
case AMDGPUISD::FMED3:
176
case AMDGPUISD::FMAD_FTZ:
177
case AMDGPUISD::RCP:
178
case AMDGPUISD::RSQ:
179
case AMDGPUISD::RCP_IFLAG:
180
// On gfx10, all 16-bit instructions preserve the high bits.
181
return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
182
case ISD::FP_ROUND:
183
// We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
184
// high bits on gfx9.
185
// TODO: If we had the source node we could see if the source was fma/mad
186
return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
187
case ISD::FMA:
188
case ISD::FMAD:
189
case AMDGPUISD::DIV_FIXUP:
190
return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
191
default:
192
// fcopysign, select and others may be lowered to 32-bit bit operations
193
// which don't zero the high bits.
194
return false;
195
}
196
}
197
198
bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
199
#ifdef EXPENSIVE_CHECKS
200
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
201
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
202
for (auto &L : LI->getLoopsInPreorder()) {
203
assert(L->isLCSSAForm(DT));
204
}
205
#endif
206
return SelectionDAGISelLegacy::runOnMachineFunction(MF);
207
}
208
209
void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
210
AU.addRequired<AMDGPUArgumentUsageInfo>();
211
AU.addRequired<UniformityInfoWrapperPass>();
212
#ifdef EXPENSIVE_CHECKS
213
AU.addRequired<DominatorTreeWrapperPass>();
214
AU.addRequired<LoopInfoWrapperPass>();
215
#endif
216
SelectionDAGISelLegacy::getAnalysisUsage(AU);
217
}
218
219
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
220
assert(Subtarget->d16PreservesUnusedBits());
221
MVT VT = N->getValueType(0).getSimpleVT();
222
if (VT != MVT::v2i16 && VT != MVT::v2f16)
223
return false;
224
225
SDValue Lo = N->getOperand(0);
226
SDValue Hi = N->getOperand(1);
227
228
LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
229
230
// build_vector lo, (load ptr) -> load_d16_hi ptr, lo
231
// build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
232
// build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
233
234
// Need to check for possible indirect dependencies on the other half of the
235
// vector to avoid introducing a cycle.
236
if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
237
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
238
239
SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
240
SDValue Ops[] = {
241
LdHi->getChain(), LdHi->getBasePtr(), TiedIn
242
};
243
244
unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
245
if (LdHi->getMemoryVT() == MVT::i8) {
246
LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
247
AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
248
} else {
249
assert(LdHi->getMemoryVT() == MVT::i16);
250
}
251
252
SDValue NewLoadHi =
253
CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
254
Ops, LdHi->getMemoryVT(),
255
LdHi->getMemOperand());
256
257
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
258
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
259
return true;
260
}
261
262
// build_vector (load ptr), hi -> load_d16_lo ptr, hi
263
// build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
264
// build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
265
LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
266
if (LdLo && Lo.hasOneUse()) {
267
SDValue TiedIn = getHi16Elt(Hi);
268
if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
269
return false;
270
271
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
272
unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
273
if (LdLo->getMemoryVT() == MVT::i8) {
274
LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
275
AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
276
} else {
277
assert(LdLo->getMemoryVT() == MVT::i16);
278
}
279
280
TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
281
282
SDValue Ops[] = {
283
LdLo->getChain(), LdLo->getBasePtr(), TiedIn
284
};
285
286
SDValue NewLoadLo =
287
CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
288
Ops, LdLo->getMemoryVT(),
289
LdLo->getMemOperand());
290
291
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
292
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
293
return true;
294
}
295
296
return false;
297
}
298
299
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
300
if (!Subtarget->d16PreservesUnusedBits())
301
return;
302
303
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
304
305
bool MadeChange = false;
306
while (Position != CurDAG->allnodes_begin()) {
307
SDNode *N = &*--Position;
308
if (N->use_empty())
309
continue;
310
311
switch (N->getOpcode()) {
312
case ISD::BUILD_VECTOR:
313
// TODO: Match load d16 from shl (extload:i16), 16
314
MadeChange |= matchLoadD16FromBuildVector(N);
315
break;
316
default:
317
break;
318
}
319
}
320
321
if (MadeChange) {
322
CurDAG->RemoveDeadNodes();
323
LLVM_DEBUG(dbgs() << "After PreProcess:\n";
324
CurDAG->dump(););
325
}
326
}
327
328
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
329
if (N->isUndef())
330
return true;
331
332
const SIInstrInfo *TII = Subtarget->getInstrInfo();
333
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
334
return TII->isInlineConstant(C->getAPIntValue());
335
336
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
337
return TII->isInlineConstant(C->getValueAPF());
338
339
return false;
340
}
341
342
/// Determine the register class for \p OpNo
343
/// \returns The register class of the virtual register that will be used for
344
/// the given operand number \OpNo or NULL if the register class cannot be
345
/// determined.
346
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
347
unsigned OpNo) const {
348
if (!N->isMachineOpcode()) {
349
if (N->getOpcode() == ISD::CopyToReg) {
350
Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
351
if (Reg.isVirtual()) {
352
MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
353
return MRI.getRegClass(Reg);
354
}
355
356
const SIRegisterInfo *TRI
357
= static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
358
return TRI->getPhysRegBaseClass(Reg);
359
}
360
361
return nullptr;
362
}
363
364
switch (N->getMachineOpcode()) {
365
default: {
366
const MCInstrDesc &Desc =
367
Subtarget->getInstrInfo()->get(N->getMachineOpcode());
368
unsigned OpIdx = Desc.getNumDefs() + OpNo;
369
if (OpIdx >= Desc.getNumOperands())
370
return nullptr;
371
int RegClass = Desc.operands()[OpIdx].RegClass;
372
if (RegClass == -1)
373
return nullptr;
374
375
return Subtarget->getRegisterInfo()->getRegClass(RegClass);
376
}
377
case AMDGPU::REG_SEQUENCE: {
378
unsigned RCID = N->getConstantOperandVal(0);
379
const TargetRegisterClass *SuperRC =
380
Subtarget->getRegisterInfo()->getRegClass(RCID);
381
382
SDValue SubRegOp = N->getOperand(OpNo + 1);
383
unsigned SubRegIdx = SubRegOp->getAsZExtVal();
384
return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
385
SubRegIdx);
386
}
387
}
388
}
389
390
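// Rebuild \p N with \p NewChain as its chain operand and \p Glue appended as
// an extra glue operand; all other operands are kept unchanged.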
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
391
SDValue Glue) const {
392
SmallVector <SDValue, 8> Ops;
393
Ops.push_back(NewChain); // Replace the chain.
394
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
395
Ops.push_back(N->getOperand(i));
396
397
Ops.push_back(Glue);
398
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
399
}
400
401
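// Emit a copy of \p Val into M0 and glue it to \p N so the copy stays
// together with the memory operation that needs it.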
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
402
const SITargetLowering& Lowering =
403
*static_cast<const SITargetLowering*>(getTargetLowering());
404
405
assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
406
407
SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
408
return glueCopyToOp(N, M0, M0.getValue(1));
409
}
410
411
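// For LDS (local) accesses, initialize M0 to -1 when the subtarget requires
// it; for GDS (region) accesses, initialize M0 to the GDS size. Other address
// spaces leave \p N untouched.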
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
412
unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
413
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
414
if (Subtarget->ldsRequiresM0Init())
415
return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
416
} else if (AS == AMDGPUAS::REGION_ADDRESS) {
417
MachineFunction &MF = CurDAG->getMachineFunction();
418
unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
419
return
420
glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
421
}
422
return N;
423
}
424
425
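// Materialize a 64-bit immediate into an SGPR pair with two S_MOV_B32s
// combined by a REG_SEQUENCE.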
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
426
EVT VT) const {
427
SDNode *Lo = CurDAG->getMachineNode(
428
AMDGPU::S_MOV_B32, DL, MVT::i32,
429
CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
430
SDNode *Hi =
431
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
432
CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
433
const SDValue Ops[] = {
434
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
435
SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
436
SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
437
438
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
439
}
440
441
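// Lower a BUILD_VECTOR / SCALAR_TO_VECTOR into a REG_SEQUENCE of the given
// register class, filling any trailing elements with IMPLICIT_DEF.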
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
442
EVT VT = N->getValueType(0);
443
unsigned NumVectorElts = VT.getVectorNumElements();
444
EVT EltVT = VT.getVectorElementType();
445
SDLoc DL(N);
446
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
447
448
if (NumVectorElts == 1) {
449
CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
450
RegClass);
451
return;
452
}
453
454
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
455
"supported yet");
456
// 32 = Max Num Vector Elements
457
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
458
// 1 = Vector Register Class
459
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
460
461
bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
462
Triple::amdgcn;
463
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
464
bool IsRegSeq = true;
465
unsigned NOps = N->getNumOperands();
466
for (unsigned i = 0; i < NOps; i++) {
467
// XXX: Why is this here?
468
if (isa<RegisterSDNode>(N->getOperand(i))) {
469
IsRegSeq = false;
470
break;
471
}
472
unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
473
: R600RegisterInfo::getSubRegFromChannel(i);
474
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
475
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
476
}
477
if (NOps != NumVectorElts) {
478
// Fill in the missing undef elements if this was a scalar_to_vector.
479
assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
480
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
481
DL, EltVT);
482
for (unsigned i = NOps; i < NumVectorElts; ++i) {
483
unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
484
: R600RegisterInfo::getSubRegFromChannel(i);
485
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
486
RegSeqArgs[1 + (2 * i) + 1] =
487
CurDAG->getTargetConstant(Sub, DL, MVT::i32);
488
}
489
}
490
491
if (!IsRegSeq)
492
SelectCode(N);
493
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
494
}
495
496
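// Entry point for AMDGPU node selection. Nodes that need custom handling are
// selected here; everything else falls through to the TableGen-generated
// matcher via SelectCode.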
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
497
unsigned int Opc = N->getOpcode();
498
if (N->isMachineOpcode()) {
499
N->setNodeId(-1);
500
return; // Already selected.
501
}
502
503
// isa<MemSDNode> almost works but is slightly too permissive for some DS
504
// intrinsics.
505
if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
506
N = glueCopyToM0LDSInit(N);
507
SelectCode(N);
508
return;
509
}
510
511
switch (Opc) {
512
default:
513
break;
514
  // We are selecting i64 ADD here instead of custom lowering it during
515
// DAG legalization, so we can fold some i64 ADDs used for address
516
// calculation into the LOAD and STORE instructions.
517
case ISD::ADDC:
518
case ISD::ADDE:
519
case ISD::SUBC:
520
case ISD::SUBE: {
521
if (N->getValueType(0) != MVT::i64)
522
break;
523
524
SelectADD_SUB_I64(N);
525
return;
526
}
527
case ISD::UADDO_CARRY:
528
case ISD::USUBO_CARRY:
529
if (N->getValueType(0) != MVT::i32)
530
break;
531
532
SelectAddcSubb(N);
533
return;
534
case ISD::UADDO:
535
case ISD::USUBO: {
536
SelectUADDO_USUBO(N);
537
return;
538
}
539
case AMDGPUISD::FMUL_W_CHAIN: {
540
SelectFMUL_W_CHAIN(N);
541
return;
542
}
543
case AMDGPUISD::FMA_W_CHAIN: {
544
SelectFMA_W_CHAIN(N);
545
return;
546
}
547
548
case ISD::SCALAR_TO_VECTOR:
549
case ISD::BUILD_VECTOR: {
550
EVT VT = N->getValueType(0);
551
unsigned NumVectorElts = VT.getVectorNumElements();
552
if (VT.getScalarSizeInBits() == 16) {
553
if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
554
if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
555
ReplaceNode(N, Packed);
556
return;
557
}
558
}
559
560
break;
561
}
562
563
assert(VT.getVectorElementType().bitsEq(MVT::i32));
564
unsigned RegClassID =
565
SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
566
SelectBuildVector(N, RegClassID);
567
return;
568
}
569
case ISD::BUILD_PAIR: {
570
SDValue RC, SubReg0, SubReg1;
571
SDLoc DL(N);
572
if (N->getValueType(0) == MVT::i128) {
573
RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
574
SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
575
SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
576
} else if (N->getValueType(0) == MVT::i64) {
577
RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
578
SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
579
SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
580
} else {
581
llvm_unreachable("Unhandled value type for BUILD_PAIR");
582
}
583
const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
584
N->getOperand(1), SubReg1 };
585
ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
586
N->getValueType(0), Ops));
587
return;
588
}
589
590
case ISD::Constant:
591
case ISD::ConstantFP: {
592
if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
593
break;
594
595
uint64_t Imm;
596
if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
597
Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
598
if (AMDGPU::isValid32BitLiteral(Imm, true))
599
break;
600
} else {
601
ConstantSDNode *C = cast<ConstantSDNode>(N);
602
Imm = C->getZExtValue();
603
if (AMDGPU::isValid32BitLiteral(Imm, false))
604
break;
605
}
606
607
SDLoc DL(N);
608
ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
609
return;
610
}
611
case AMDGPUISD::BFE_I32:
612
case AMDGPUISD::BFE_U32: {
613
// There is a scalar version available, but unlike the vector version which
614
    // has separate operands for the offset and width, the scalar version packs
615
// the width and offset into a single operand. Try to move to the scalar
616
// version if the offsets are constant, so that we can try to keep extended
617
// loads of kernel arguments in SGPRs.
618
619
// TODO: Technically we could try to pattern match scalar bitshifts of
620
// dynamic values, but it's probably not useful.
621
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
622
if (!Offset)
623
break;
624
625
ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
626
if (!Width)
627
break;
628
629
bool Signed = Opc == AMDGPUISD::BFE_I32;
630
631
uint32_t OffsetVal = Offset->getZExtValue();
632
uint32_t WidthVal = Width->getZExtValue();
633
634
ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
635
WidthVal));
636
return;
637
}
638
case AMDGPUISD::DIV_SCALE: {
639
SelectDIV_SCALE(N);
640
return;
641
}
642
case AMDGPUISD::MAD_I64_I32:
643
case AMDGPUISD::MAD_U64_U32: {
644
SelectMAD_64_32(N);
645
return;
646
}
647
case ISD::SMUL_LOHI:
648
case ISD::UMUL_LOHI:
649
return SelectMUL_LOHI(N);
650
case ISD::CopyToReg: {
651
const SITargetLowering& Lowering =
652
*static_cast<const SITargetLowering*>(getTargetLowering());
653
N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
654
break;
655
}
656
case ISD::AND:
657
case ISD::SRL:
658
case ISD::SRA:
659
case ISD::SIGN_EXTEND_INREG:
660
if (N->getValueType(0) != MVT::i32)
661
break;
662
663
SelectS_BFE(N);
664
return;
665
case ISD::BRCOND:
666
SelectBRCOND(N);
667
return;
668
case ISD::FP_EXTEND:
669
SelectFP_EXTEND(N);
670
return;
671
case AMDGPUISD::CVT_PKRTZ_F16_F32:
672
case AMDGPUISD::CVT_PKNORM_I16_F32:
673
case AMDGPUISD::CVT_PKNORM_U16_F32:
674
case AMDGPUISD::CVT_PK_U16_U32:
675
case AMDGPUISD::CVT_PK_I16_I32: {
676
// Hack around using a legal type if f16 is illegal.
677
if (N->getValueType(0) == MVT::i32) {
678
MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
679
N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
680
{ N->getOperand(0), N->getOperand(1) });
681
SelectCode(N);
682
return;
683
}
684
685
break;
686
}
687
case ISD::INTRINSIC_W_CHAIN: {
688
SelectINTRINSIC_W_CHAIN(N);
689
return;
690
}
691
case ISD::INTRINSIC_WO_CHAIN: {
692
SelectINTRINSIC_WO_CHAIN(N);
693
return;
694
}
695
case ISD::INTRINSIC_VOID: {
696
SelectINTRINSIC_VOID(N);
697
return;
698
}
699
case AMDGPUISD::WAVE_ADDRESS: {
700
SelectWAVE_ADDRESS(N);
701
return;
702
}
703
case ISD::STACKRESTORE: {
704
SelectSTACKRESTORE(N);
705
return;
706
}
707
}
708
709
SelectCode(N);
710
}
711
712
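// A branch is treated as uniform if an earlier pass marked its terminator
// with "amdgpu.uniform" or "structurizecfg.uniform" metadata.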
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
713
const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
714
const Instruction *Term = BB->getTerminator();
715
return Term->getMetadata("amdgpu.uniform") ||
716
Term->getMetadata("structurizecfg.uniform");
717
}
718
719
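// Returns true if the AND mask does not change the low \p ShAmtBits bits of
// its first operand, so the mask is redundant when the value is only used as
// a shift amount.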
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
720
unsigned ShAmtBits) const {
721
assert(N->getOpcode() == ISD::AND);
722
723
const APInt &RHS = N->getConstantOperandAPInt(1);
724
if (RHS.countr_one() >= ShAmtBits)
725
return true;
726
727
const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
728
return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
729
}
730
731
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
732
SDValue &N0, SDValue &N1) {
733
if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
734
Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
735
    // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
736
// (i64 (bitcast (v2i32 (build_vector
737
// (or (extract_vector_elt V, 0), OFFSET),
738
// (extract_vector_elt V, 1)))))
739
SDValue Lo = Addr.getOperand(0).getOperand(0);
740
if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
741
SDValue BaseLo = Lo.getOperand(0);
742
SDValue BaseHi = Addr.getOperand(0).getOperand(1);
743
// Check that split base (Lo and Hi) are extracted from the same one.
744
if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
745
BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
746
BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
747
// Lo is statically extracted from index 0.
748
isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
749
BaseLo.getConstantOperandVal(1) == 0 &&
750
          // Hi is statically extracted from index 1.
751
isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
752
BaseHi.getConstantOperandVal(1) == 1) {
753
N0 = BaseLo.getOperand(0).getOperand(0);
754
N1 = Lo.getOperand(1);
755
return true;
756
}
757
}
758
}
759
return false;
760
}
761
762
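// Split a 64-bit address into a base and a constant offset, also recognizing
// the split-OR form handled by getBaseWithOffsetUsingSplitOR.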
bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
763
SDValue &RHS) const {
764
if (CurDAG->isBaseWithConstantOffset(Addr)) {
765
LHS = Addr.getOperand(0);
766
RHS = Addr.getOperand(1);
767
return true;
768
}
769
770
if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
771
assert(LHS && RHS && isa<ConstantSDNode>(RHS));
772
return true;
773
}
774
775
return false;
776
}
777
778
StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
779
return "AMDGPU DAG->DAG Pattern Instruction Selection";
780
}
781
782
AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
783
: SelectionDAGISelPass(
784
std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}
785
786
PreservedAnalyses
787
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
788
MachineFunctionAnalysisManager &MFAM) {
789
#ifdef EXPENSIVE_CHECKS
790
auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
791
.getManager();
792
auto &F = MF.getFunction();
793
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
794
LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
795
for (auto &L : LI.getLoopsInPreorder())
796
assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
797
#endif
798
return SelectionDAGISelPass::run(MF, MFAM);
799
}
800
801
//===----------------------------------------------------------------------===//
802
// Complex Patterns
803
//===----------------------------------------------------------------------===//
804
805
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
806
SDValue &Offset) {
807
return false;
808
}
809
810
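// R600 indirect addressing: split \p Addr into a base and a constant offset,
// using INDIRECT_BASE_ADDR as the base for purely constant addresses.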
bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
811
SDValue &Offset) {
812
ConstantSDNode *C;
813
SDLoc DL(Addr);
814
815
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
816
Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
817
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
818
} else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
819
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
820
Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
821
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
822
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
823
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
824
Base = Addr.getOperand(0);
825
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
826
} else {
827
Base = Addr;
828
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
829
}
830
831
return true;
832
}
833
834
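// Materialize a 32-bit immediate into an SGPR with S_MOV_B32.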
SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
835
const SDLoc &DL) const {
836
SDNode *Mov = CurDAG->getMachineNode(
837
AMDGPU::S_MOV_B32, DL, MVT::i32,
838
CurDAG->getTargetConstant(Val, DL, MVT::i32));
839
return SDValue(Mov, 0);
840
}
841
842
// FIXME: Should only handle uaddo_carry/usubo_carry
843
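// Expand a 64-bit add/sub into two 32-bit halves: the low half uses the plain
// add/sub, the high half uses the carry-consuming form, and the results are
// recombined with a REG_SEQUENCE.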
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
844
SDLoc DL(N);
845
SDValue LHS = N->getOperand(0);
846
SDValue RHS = N->getOperand(1);
847
848
unsigned Opcode = N->getOpcode();
849
bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
850
bool ProduceCarry =
851
ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
852
bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
853
854
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
855
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
856
857
SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
858
DL, MVT::i32, LHS, Sub0);
859
SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
860
DL, MVT::i32, LHS, Sub1);
861
862
SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
863
DL, MVT::i32, RHS, Sub0);
864
SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
865
DL, MVT::i32, RHS, Sub1);
866
867
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
868
869
static const unsigned OpcMap[2][2][2] = {
870
{{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
871
{AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
872
{{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
873
{AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
874
875
unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
876
unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
877
878
SDNode *AddLo;
879
if (!ConsumeCarry) {
880
SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
881
AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
882
} else {
883
SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
884
AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
885
}
886
SDValue AddHiArgs[] = {
887
SDValue(Hi0, 0),
888
SDValue(Hi1, 0),
889
SDValue(AddLo, 1)
890
};
891
SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
892
893
SDValue RegSequenceArgs[] = {
894
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
895
SDValue(AddLo,0),
896
Sub0,
897
SDValue(AddHi,0),
898
Sub1,
899
};
900
SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
901
MVT::i64, RegSequenceArgs);
902
903
if (ProduceCarry) {
904
// Replace the carry-use
905
ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
906
}
907
908
// Replace the remaining uses.
909
ReplaceNode(N, RegSequence);
910
}
911
912
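// Select UADDO_CARRY/USUBO_CARRY to the VALU carry instructions for divergent
// nodes, or to the scalar carry pseudos otherwise.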
void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
913
SDLoc DL(N);
914
SDValue LHS = N->getOperand(0);
915
SDValue RHS = N->getOperand(1);
916
SDValue CI = N->getOperand(2);
917
918
if (N->isDivergent()) {
919
unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
920
: AMDGPU::V_SUBB_U32_e64;
921
CurDAG->SelectNodeTo(
922
N, Opc, N->getVTList(),
923
{LHS, RHS, CI,
924
CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
925
} else {
926
unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
927
: AMDGPU::S_SUB_CO_PSEUDO;
928
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
929
}
930
}
931
932
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
933
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
934
// carry out despite the _i32 name. These were renamed in VI to _U32.
935
// FIXME: We should probably rename the opcodes here.
936
bool IsAdd = N->getOpcode() == ISD::UADDO;
937
bool IsVALU = N->isDivergent();
938
939
for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
940
++UI)
941
if (UI.getUse().getResNo() == 1) {
942
if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
943
(!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
944
IsVALU = true;
945
break;
946
}
947
}
948
949
if (IsVALU) {
950
unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
951
952
CurDAG->SelectNodeTo(
953
N, Opc, N->getVTList(),
954
{N->getOperand(0), N->getOperand(1),
955
CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
956
} else {
957
unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
958
: AMDGPU::S_USUBO_PSEUDO;
959
960
CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
961
{N->getOperand(0), N->getOperand(1)});
962
}
963
}
964
965
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
966
SDLoc SL(N);
967
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
968
SDValue Ops[10];
969
970
SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
971
SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
972
SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
973
Ops[8] = N->getOperand(0);
974
Ops[9] = N->getOperand(4);
975
976
// If there are no source modifiers, prefer fmac over fma because it can use
977
// the smaller VOP2 encoding.
978
bool UseFMAC = Subtarget->hasDLInsts() &&
979
cast<ConstantSDNode>(Ops[0])->isZero() &&
980
cast<ConstantSDNode>(Ops[2])->isZero() &&
981
cast<ConstantSDNode>(Ops[4])->isZero();
982
unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
983
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
984
}
985
986
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
987
SDLoc SL(N);
988
// src0_modifiers, src0, src1_modifiers, src1, clamp, omod
989
SDValue Ops[8];
990
991
SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
992
SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
993
Ops[6] = N->getOperand(0);
994
Ops[7] = N->getOperand(3);
995
996
CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
997
}
998
999
// We need to handle this here because tablegen doesn't support matching
1000
// instructions with multiple outputs.
1001
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1002
SDLoc SL(N);
1003
EVT VT = N->getValueType(0);
1004
1005
assert(VT == MVT::f32 || VT == MVT::f64);
1006
1007
unsigned Opc
1008
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1009
1010
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1011
// omod
1012
SDValue Ops[8];
1013
SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1014
SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1015
SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1016
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1017
}
1018
1019
// We need to handle this here because tablegen doesn't support matching
1020
// instructions with multiple outputs.
1021
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1022
SDLoc SL(N);
1023
bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1024
unsigned Opc;
1025
if (Subtarget->hasMADIntraFwdBug())
1026
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1027
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1028
else
1029
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1030
1031
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1032
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1033
Clamp };
1034
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1035
}
1036
1037
// We need to handle this here because tablegen doesn't support matching
1038
// instructions with multiple outputs.
1039
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1040
SDLoc SL(N);
1041
bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1042
unsigned Opc;
1043
if (Subtarget->hasMADIntraFwdBug())
1044
Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1045
: AMDGPU::V_MAD_U64_U32_gfx11_e64;
1046
else
1047
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1048
1049
SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1050
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1051
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1052
SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1053
if (!SDValue(N, 0).use_empty()) {
1054
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1055
SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1056
MVT::i32, SDValue(Mad, 0), Sub0);
1057
ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1058
}
1059
if (!SDValue(N, 1).use_empty()) {
1060
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1061
SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1062
MVT::i32, SDValue(Mad, 0), Sub1);
1063
ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1064
}
1065
CurDAG->RemoveDeadNode(N);
1066
}
1067
1068
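// A DS immediate offset is legal if it fits in 16 bits; on subtargets without
// a usable DS offset the base must additionally be known non-negative.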
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1069
if (!isUInt<16>(Offset))
1070
return false;
1071
1072
if (!Base || Subtarget->hasUsableDSOffset() ||
1073
Subtarget->unsafeDSOffsetFoldingEnabled())
1074
return true;
1075
1076
  // On Southern Islands instructions with a negative base value and an offset
1077
// don't seem to work.
1078
return CurDAG->SignBitIsZero(Base);
1079
}
1080
1081
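// Match a DS address as (base, 16-bit immediate offset), folding constant
// offsets, (sub C, x) addresses and purely constant addresses where legal.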
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1082
SDValue &Offset) const {
1083
SDLoc DL(Addr);
1084
if (CurDAG->isBaseWithConstantOffset(Addr)) {
1085
SDValue N0 = Addr.getOperand(0);
1086
SDValue N1 = Addr.getOperand(1);
1087
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1088
if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1089
// (add n0, c0)
1090
Base = N0;
1091
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1092
return true;
1093
}
1094
} else if (Addr.getOpcode() == ISD::SUB) {
1095
// sub C, x -> add (sub 0, x), C
1096
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1097
int64_t ByteOffset = C->getSExtValue();
1098
if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1099
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1100
1101
// XXX - This is kind of hacky. Create a dummy sub node so we can check
1102
// the known bits in isDSOffsetLegal. We need to emit the selected node
1103
// here, so this is thrown away.
1104
SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1105
Zero, Addr.getOperand(1));
1106
1107
if (isDSOffsetLegal(Sub, ByteOffset)) {
1108
SmallVector<SDValue, 3> Opnds;
1109
Opnds.push_back(Zero);
1110
Opnds.push_back(Addr.getOperand(1));
1111
1112
// FIXME: Select to VOP3 version for with-carry.
1113
unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1114
if (Subtarget->hasAddNoCarry()) {
1115
SubOp = AMDGPU::V_SUB_U32_e64;
1116
Opnds.push_back(
1117
CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1118
}
1119
1120
MachineSDNode *MachineSub =
1121
CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1122
1123
Base = SDValue(MachineSub, 0);
1124
Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1125
return true;
1126
}
1127
}
1128
}
1129
} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1130
// If we have a constant address, prefer to put the constant into the
1131
// offset. This can save moves to load the constant address since multiple
1132
// operations can share the zero base address register, and enables merging
1133
// into read2 / write2 instructions.
1134
1135
SDLoc DL(Addr);
1136
1137
if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1138
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1139
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1140
DL, MVT::i32, Zero);
1141
Base = SDValue(MovZero, 0);
1142
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1143
return true;
1144
}
1145
}
1146
1147
// default case
1148
Base = Addr;
1149
Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1150
return true;
1151
}
1152
1153
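// The two offsets of a read2/write2 are legal if both are multiples of Size
// and fit in 8 bits after dividing by Size, with the same base restrictions
// as isDSOffsetLegal.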
bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1154
unsigned Offset1,
1155
unsigned Size) const {
1156
if (Offset0 % Size != 0 || Offset1 % Size != 0)
1157
return false;
1158
if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1159
return false;
1160
1161
if (!Base || Subtarget->hasUsableDSOffset() ||
1162
Subtarget->unsafeDSOffsetFoldingEnabled())
1163
return true;
1164
1165
  // On Southern Islands instructions with a negative base value and an offset
1166
// don't seem to work.
1167
return CurDAG->SignBitIsZero(Base);
1168
}
1169
1170
// Return whether the operation has the NoUnsignedWrap property.
1171
static bool isNoUnsignedWrap(SDValue Addr) {
1172
return (Addr.getOpcode() == ISD::ADD &&
1173
Addr->getFlags().hasNoUnsignedWrap()) ||
1174
Addr->getOpcode() == ISD::OR;
1175
}
1176
1177
// Check that the base address of flat scratch load/store in the form of `base +
1178
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1179
// requirement). We always treat the first operand as the base address here.
1180
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1181
if (isNoUnsignedWrap(Addr))
1182
return true;
1183
1184
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1185
// values.
1186
if (Subtarget->hasSignedScratchOffsets())
1187
return true;
1188
1189
auto LHS = Addr.getOperand(0);
1190
auto RHS = Addr.getOperand(1);
1191
1192
// If the immediate offset is negative and within certain range, the base
1193
// address cannot also be negative. If the base is also negative, the sum
1194
// would be either negative or much larger than the valid range of scratch
1195
// memory a thread can access.
1196
ConstantSDNode *ImmOp = nullptr;
1197
if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1198
if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1199
return true;
1200
}
1201
1202
return CurDAG->SignBitIsZero(LHS);
1203
}
1204
1205
// Check that the address values in SGPR/VGPR are legal for flat scratch in the form
1206
// of: SGPR + VGPR.
1207
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1208
if (isNoUnsignedWrap(Addr))
1209
return true;
1210
1211
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1212
// values.
1213
if (Subtarget->hasSignedScratchOffsets())
1214
return true;
1215
1216
auto LHS = Addr.getOperand(0);
1217
auto RHS = Addr.getOperand(1);
1218
return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1219
}
1220
1221
// Check that the address values in SGPR/VGPR are legal for flat scratch in the form
1222
// of: SGPR + VGPR + Imm.
1223
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1224
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1225
// values.
1226
if (AMDGPU::isGFX12Plus(*Subtarget))
1227
return true;
1228
1229
auto Base = Addr.getOperand(0);
1230
auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1231
// If the immediate offset is negative and within certain range, the base
1232
// address cannot also be negative. If the base is also negative, the sum
1233
// would be either negative or much larger than the valid range of scratch
1234
// memory a thread can access.
1235
if (isNoUnsignedWrap(Base) &&
1236
(isNoUnsignedWrap(Addr) ||
1237
(RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1238
return true;
1239
1240
auto LHS = Base.getOperand(0);
1241
auto RHS = Base.getOperand(1);
1242
return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1243
}
1244
1245
// TODO: If the offset is too big, put the low 16 bits into the offset.
1246
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1247
SDValue &Offset0,
1248
SDValue &Offset1) const {
1249
return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1250
}
1251
1252
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1253
SDValue &Offset0,
1254
SDValue &Offset1) const {
1255
return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1256
}
1257
1258
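// Match a DS read2/write2 address as (base, offset0, offset1), where the two
// 8-bit offsets are expressed in units of \p Size bytes.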
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1259
SDValue &Offset0, SDValue &Offset1,
1260
unsigned Size) const {
1261
SDLoc DL(Addr);
1262
1263
if (CurDAG->isBaseWithConstantOffset(Addr)) {
1264
SDValue N0 = Addr.getOperand(0);
1265
SDValue N1 = Addr.getOperand(1);
1266
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1267
unsigned OffsetValue0 = C1->getZExtValue();
1268
unsigned OffsetValue1 = OffsetValue0 + Size;
1269
1270
// (add n0, c0)
1271
if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1272
Base = N0;
1273
Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1274
Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1275
return true;
1276
}
1277
} else if (Addr.getOpcode() == ISD::SUB) {
1278
// sub C, x -> add (sub 0, x), C
1279
if (const ConstantSDNode *C =
1280
dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1281
unsigned OffsetValue0 = C->getZExtValue();
1282
unsigned OffsetValue1 = OffsetValue0 + Size;
1283
1284
if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1285
SDLoc DL(Addr);
1286
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1287
1288
// XXX - This is kind of hacky. Create a dummy sub node so we can check
1289
// the known bits in isDSOffsetLegal. We need to emit the selected node
1290
// here, so this is thrown away.
1291
SDValue Sub =
1292
CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1293
1294
if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1295
SmallVector<SDValue, 3> Opnds;
1296
Opnds.push_back(Zero);
1297
Opnds.push_back(Addr.getOperand(1));
1298
unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1299
if (Subtarget->hasAddNoCarry()) {
1300
SubOp = AMDGPU::V_SUB_U32_e64;
1301
Opnds.push_back(
1302
CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1303
}
1304
1305
MachineSDNode *MachineSub = CurDAG->getMachineNode(
1306
SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1307
1308
Base = SDValue(MachineSub, 0);
1309
Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1310
Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1311
return true;
1312
}
1313
}
1314
}
1315
} else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1316
unsigned OffsetValue0 = CAddr->getZExtValue();
1317
unsigned OffsetValue1 = OffsetValue0 + Size;
1318
1319
if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1320
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1321
MachineSDNode *MovZero =
1322
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1323
Base = SDValue(MovZero, 0);
1324
Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1325
Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1326
return true;
1327
}
1328
}
1329
1330
// default case
1331
1332
Base = Addr;
1333
Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1334
Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1335
return true;
1336
}
1337
1338
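// Common MUBUF matcher: split \p Addr into a pointer, VGPR address, SGPR
// offset and immediate offset, and set the offen/idxen/addr64 flags.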
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1339
SDValue &SOffset, SDValue &Offset,
1340
SDValue &Offen, SDValue &Idxen,
1341
SDValue &Addr64) const {
1342
  // Subtarget prefers to use flat instructions
1343
// FIXME: This should be a pattern predicate and not reach here
1344
if (Subtarget->useFlatForGlobal())
1345
return false;
1346
1347
SDLoc DL(Addr);
1348
1349
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1350
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1351
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1352
SOffset = Subtarget->hasRestrictedSOffset()
1353
? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1354
: CurDAG->getTargetConstant(0, DL, MVT::i32);
1355
1356
ConstantSDNode *C1 = nullptr;
1357
SDValue N0 = Addr;
1358
if (CurDAG->isBaseWithConstantOffset(Addr)) {
1359
C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1360
if (isUInt<32>(C1->getZExtValue()))
1361
N0 = Addr.getOperand(0);
1362
else
1363
C1 = nullptr;
1364
}
1365
1366
if (N0.getOpcode() == ISD::ADD) {
1367
// (add N2, N3) -> addr64, or
1368
// (add (add N2, N3), C1) -> addr64
1369
SDValue N2 = N0.getOperand(0);
1370
SDValue N3 = N0.getOperand(1);
1371
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1372
1373
if (N2->isDivergent()) {
1374
if (N3->isDivergent()) {
1375
// Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1376
// addr64, and construct the resource from a 0 address.
1377
Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1378
VAddr = N0;
1379
} else {
1380
// N2 is divergent, N3 is not.
1381
Ptr = N3;
1382
VAddr = N2;
1383
}
1384
} else {
1385
// N2 is not divergent.
1386
Ptr = N2;
1387
VAddr = N3;
1388
}
1389
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1390
} else if (N0->isDivergent()) {
1391
// N0 is divergent. Use it as the addr64, and construct the resource from a
1392
// 0 address.
1393
Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1394
VAddr = N0;
1395
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1396
} else {
1397
// N0 -> offset, or
1398
// (N0 + C1) -> offset
1399
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1400
Ptr = N0;
1401
}
1402
1403
if (!C1) {
1404
// No offset.
1405
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1406
return true;
1407
}
1408
1409
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1410
if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1411
// Legal offset for instruction.
1412
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1413
return true;
1414
}
1415
1416
// Illegal offset, store it in soffset.
1417
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1418
SOffset =
1419
SDValue(CurDAG->getMachineNode(
1420
AMDGPU::S_MOV_B32, DL, MVT::i32,
1421
CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1422
0);
1423
return true;
1424
}
1425
1426
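// Addr64 variant of the MUBUF matcher; only succeeds on subtargets that still
// have the addr64 bit, wrapping the pointer into an addr64 resource.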
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1427
SDValue &VAddr, SDValue &SOffset,
1428
SDValue &Offset) const {
1429
SDValue Ptr, Offen, Idxen, Addr64;
1430
1431
// addr64 bit was removed for volcanic islands.
1432
// FIXME: This should be a pattern predicate and not reach here
1433
if (!Subtarget->hasAddr64())
1434
return false;
1435
1436
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1437
return false;
1438
1439
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1440
if (C->getSExtValue()) {
1441
SDLoc DL(Addr);
1442
1443
const SITargetLowering& Lowering =
1444
*static_cast<const SITargetLowering*>(getTargetLowering());
1445
1446
SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1447
return true;
1448
}
1449
1450
return false;
1451
}
1452
1453
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1454
SDLoc DL(N);
1455
1456
auto *FI = dyn_cast<FrameIndexSDNode>(N);
1457
SDValue TFI =
1458
FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1459
1460
// We rebase the base address into an absolute stack address and hence
1461
// use constant 0 for soffset. This value must be retained until
1462
// frame elimination and eliminateFrameIndex will choose the appropriate
1463
// frame register if need be.
1464
return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1465
}
1466
1467
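// Match a private (scratch) access that needs a VGPR address: returns the
// scratch resource descriptor, a VGPR address, a zero soffset and a legal
// immediate offset.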
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1468
SDValue Addr, SDValue &Rsrc,
1469
SDValue &VAddr, SDValue &SOffset,
1470
SDValue &ImmOffset) const {
1471
1472
SDLoc DL(Addr);
1473
MachineFunction &MF = CurDAG->getMachineFunction();
1474
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1475
1476
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1477
1478
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1479
int64_t Imm = CAddr->getSExtValue();
1480
const int64_t NullPtr =
1481
AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1482
// Don't fold null pointer.
1483
if (Imm != NullPtr) {
1484
const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1485
SDValue HighBits =
1486
CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1487
MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1488
AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1489
VAddr = SDValue(MovHighBits, 0);
1490
1491
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1492
ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1493
return true;
1494
}
1495
}
1496
1497
if (CurDAG->isBaseWithConstantOffset(Addr)) {
1498
// (add n0, c1)
1499
1500
SDValue N0 = Addr.getOperand(0);
1501
uint64_t C1 = Addr.getConstantOperandVal(1);
1502
1503
// Offsets in vaddr must be positive if range checking is enabled.
1504
//
1505
// The total computation of vaddr + soffset + offset must not overflow. If
1506
// vaddr is negative, even if offset is 0 the sgpr offset add will end up
1507
// overflowing.
1508
//
1509
// Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1510
// always perform a range check. If a negative vaddr base index was used,
1511
// this would fail the range check. The overall address computation would
1512
// compute a valid address, but this doesn't happen due to the range
1513
// check. For out-of-bounds MUBUF loads, a 0 is returned.
1514
//
1515
// Therefore it should be safe to fold any VGPR offset on gfx9 into the
1516
// MUBUF vaddr, but not on older subtargets which can only do this if the
1517
// sign bit is known 0.
1518
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1519
if (TII->isLegalMUBUFImmOffset(C1) &&
1520
(!Subtarget->privateMemoryResourceIsRangeChecked() ||
1521
CurDAG->SignBitIsZero(N0))) {
1522
std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1523
ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1524
return true;
1525
}
1526
}
1527
1528
// (node)
1529
std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1530
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1531
return true;
1532
}
1533
1534
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1535
if (Val.getOpcode() != ISD::CopyFromReg)
1536
return false;
1537
auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1538
if (!Reg.isPhysical())
1539
return false;
1540
auto RC = TRI.getPhysRegBaseClass(Reg);
1541
return RC && TRI.isSGPRClass(RC);
1542
}
1543
1544
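// Match a private (scratch) access whose address is a uniform SGPR value,
// optionally plus a legal immediate offset, so no VGPR address is needed.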
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1545
SDValue Addr,
1546
SDValue &SRsrc,
1547
SDValue &SOffset,
1548
SDValue &Offset) const {
1549
const SIRegisterInfo *TRI =
1550
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1551
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1552
MachineFunction &MF = CurDAG->getMachineFunction();
1553
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1554
SDLoc DL(Addr);
1555
1556
// CopyFromReg <sgpr>
1557
if (IsCopyFromSGPR(*TRI, Addr)) {
1558
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1559
SOffset = Addr;
1560
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1561
return true;
1562
}
1563
1564
ConstantSDNode *CAddr;
1565
if (Addr.getOpcode() == ISD::ADD) {
1566
// Add (CopyFromReg <sgpr>) <constant>
1567
CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1568
if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1569
return false;
1570
if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1571
return false;
1572
1573
SOffset = Addr.getOperand(0);
1574
} else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1575
TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1576
// <constant>
1577
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1578
} else {
1579
return false;
1580
}
1581
1582
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1583
1584
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1585
return true;
1586
}
1587
1588
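// Offset-only MUBUF form: succeeds when no VGPR address, index or addr64 is
// needed, building the resource descriptor from the pointer.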
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1589
SDValue &SOffset, SDValue &Offset
1590
) const {
1591
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1592
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1593
1594
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1595
return false;
1596
1597
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1598
!cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1599
!cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1600
uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1601
APInt::getAllOnes(32).getZExtValue(); // Size
1602
SDLoc DL(Addr);
1603
1604
const SITargetLowering& Lowering =
1605
*static_cast<const SITargetLowering*>(getTargetLowering());
1606
1607
SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1608
return true;
1609
}
1610
return false;
1611
}
1612
1613
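// With restricted SOffset, a zero byte offset must be encoded as SGPR_NULL
// rather than a literal 0.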
bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1614
SDValue &SOffset) const {
1615
if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1616
SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1617
return true;
1618
}
1619
1620
SOffset = ByteOffsetNode;
1621
return true;
1622
}
1623
1624
// Find a load or store from the corresponding pattern root.
1625
// Roots may be build_vector, bitconvert or their combinations.
1626
static MemSDNode* findMemSDNode(SDNode *N) {
1627
N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1628
if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1629
return MN;
1630
assert(isa<BuildVectorSDNode>(N));
1631
for (SDValue V : N->op_values())
1632
if (MemSDNode *MN =
1633
dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1634
return MN;
1635
llvm_unreachable("cannot find MemSDNode in the pattern!");
1636
}
1637
1638
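// Select a flat/global/scratch address as a (vaddr, immediate offset) pair.
// A constant offset that fits the instruction's offset field is folded into
// the immediate; otherwise the offset is split and the remainder is added to
// the base with VALU adds so that both pieces keep the same sign.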
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1639
SDValue &VAddr, SDValue &Offset,
1640
uint64_t FlatVariant) const {
1641
int64_t OffsetVal = 0;
1642
1643
unsigned AS = findMemSDNode(N)->getAddressSpace();
1644
1645
bool CanHaveFlatSegmentOffsetBug =
1646
Subtarget->hasFlatSegmentOffsetBug() &&
1647
FlatVariant == SIInstrFlags::FLAT &&
1648
(AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1649
1650
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1651
SDValue N0, N1;
1652
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1653
(FlatVariant != SIInstrFlags::FlatScratch ||
1654
isFlatScratchBaseLegal(Addr))) {
1655
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1656
1657
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1658
if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1659
Addr = N0;
1660
OffsetVal = COffsetVal;
1661
} else {
1662
// If the offset doesn't fit, put the low bits into the offset field and
1663
// add the rest.
1664
//
1665
// For a FLAT instruction the hardware decides whether to access
1666
// global/scratch/shared memory based on the high bits of vaddr,
1667
// ignoring the offset field, so we have to ensure that when we add
1668
// remainder to vaddr it still points into the same underlying object.
1669
// The easiest way to do that is to make sure that we split the offset
1670
// into two pieces that are both >= 0 or both <= 0.
1671
1672
SDLoc DL(N);
1673
uint64_t RemainderOffset;
1674
1675
std::tie(OffsetVal, RemainderOffset) =
1676
TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1677
1678
SDValue AddOffsetLo =
1679
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1680
SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1681
1682
if (Addr.getValueType().getSizeInBits() == 32) {
1683
SmallVector<SDValue, 3> Opnds;
1684
Opnds.push_back(N0);
1685
Opnds.push_back(AddOffsetLo);
1686
unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1687
if (Subtarget->hasAddNoCarry()) {
1688
AddOp = AMDGPU::V_ADD_U32_e64;
1689
Opnds.push_back(Clamp);
1690
}
1691
Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1692
} else {
1693
// TODO: Should this try to use a scalar add pseudo if the base address
1694
// is uniform and saddr is usable?
1695
SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1696
SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1697
1698
SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1699
DL, MVT::i32, N0, Sub0);
1700
SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1701
DL, MVT::i32, N0, Sub1);
1702
1703
SDValue AddOffsetHi =
1704
getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1705
1706
SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1707
1708
SDNode *Add =
1709
CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1710
{AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1711
1712
SDNode *Addc = CurDAG->getMachineNode(
1713
AMDGPU::V_ADDC_U32_e64, DL, VTs,
1714
{AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1715
1716
SDValue RegSequenceArgs[] = {
1717
CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1718
SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1719
1720
Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1721
MVT::i64, RegSequenceArgs),
1722
0);
1723
}
1724
}
1725
}
1726
}
1727
1728
VAddr = Addr;
1729
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1730
return true;
1731
}
1732
1733
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1734
SDValue &VAddr,
1735
SDValue &Offset) const {
1736
return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1737
}
1738
1739
bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1740
SDValue &VAddr,
1741
SDValue &Offset) const {
1742
return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1743
}
1744
1745
bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1746
SDValue &VAddr,
1747
SDValue &Offset) const {
1748
return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1749
SIInstrFlags::FlatScratch);
1750
}
1751
1752
// If this matches zero_extend i32:x, return x
1753
static SDValue matchZExtFromI32(SDValue Op) {
1754
if (Op.getOpcode() != ISD::ZERO_EXTEND)
1755
return SDValue();
1756
1757
SDValue ExtSrc = Op.getOperand(0);
1758
return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1759
}
1760
1761
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1762
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1763
SDValue Addr,
1764
SDValue &SAddr,
1765
SDValue &VOffset,
1766
SDValue &Offset) const {
1767
int64_t ImmOffset = 0;
1768
1769
// Match the immediate offset first, which canonically is moved as low as
1770
// possible.
1771
1772
SDValue LHS, RHS;
1773
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1774
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1775
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1776
1777
if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1778
SIInstrFlags::FlatGlobal)) {
1779
Addr = LHS;
1780
ImmOffset = COffsetVal;
1781
} else if (!LHS->isDivergent()) {
1782
if (COffsetVal > 0) {
1783
SDLoc SL(N);
1784
// saddr + large_offset -> saddr +
1785
// (voffset = large_offset & ~MaxOffset) +
1786
// (large_offset & MaxOffset);
1787
int64_t SplitImmOffset, RemainderOffset;
1788
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1789
COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1790
1791
if (isUInt<32>(RemainderOffset)) {
1792
SDNode *VMov = CurDAG->getMachineNode(
1793
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1794
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1795
VOffset = SDValue(VMov, 0);
1796
SAddr = LHS;
1797
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1798
return true;
1799
}
1800
}
1801
1802
// We are adding a 64 bit SGPR and a constant. If constant bus limit
1803
// is 1 we would need to perform 1 or 2 extra moves for each half of
1804
// the constant and it is better to do a scalar add and then issue a
1805
// single VALU instruction to materialize zero. Otherwise it takes fewer
1806
// instructions to perform VALU adds with immediates or inline literals.
1807
unsigned NumLiterals =
1808
!TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1809
!TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1810
if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1811
return false;
1812
}
1813
}
1814
1815
// Match the variable offset.
1816
if (Addr.getOpcode() == ISD::ADD) {
1817
LHS = Addr.getOperand(0);
1818
RHS = Addr.getOperand(1);
1819
1820
if (!LHS->isDivergent()) {
1821
// add (i64 sgpr), (zero_extend (i32 vgpr))
1822
if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1823
SAddr = LHS;
1824
VOffset = ZextRHS;
1825
}
1826
}
1827
1828
if (!SAddr && !RHS->isDivergent()) {
1829
// add (zero_extend (i32 vgpr)), (i64 sgpr)
1830
if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1831
SAddr = RHS;
1832
VOffset = ZextLHS;
1833
}
1834
}
1835
1836
if (SAddr) {
1837
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1838
return true;
1839
}
1840
}
1841
1842
if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1843
isa<ConstantSDNode>(Addr))
1844
return false;
1845
1846
// It's cheaper to materialize a single 32-bit zero for vaddr than the two
1847
// moves required to copy a 64-bit SGPR to VGPR.
1848
SAddr = Addr;
1849
SDNode *VMov =
1850
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1851
CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1852
VOffset = SDValue(VMov, 0);
1853
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1854
return true;
1855
}
1856
1857
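// If SAddr is a frame index (possibly inside an add), rewrite it to use a
// target frame index, folding the add into an S_ADD_I32 so the address stays
// scalar.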
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1858
if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1859
SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1860
} else if (SAddr.getOpcode() == ISD::ADD &&
1861
isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1862
// Materialize this into a scalar move for scalar address to avoid
1863
// readfirstlane.
1864
auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1865
SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1866
FI->getValueType(0));
1867
SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1868
MVT::i32, TFI, SAddr.getOperand(1)),
1869
0);
1870
}
1871
1872
return SAddr;
1873
}
1874
1875
// Match (32-bit SGPR base) + sext(imm offset)
1876
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1877
SDValue &SAddr,
1878
SDValue &Offset) const {
1879
if (Addr->isDivergent())
1880
return false;
1881
1882
SDLoc DL(Addr);
1883
1884
int64_t COffsetVal = 0;
1885
1886
if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1887
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1888
SAddr = Addr.getOperand(0);
1889
} else {
1890
SAddr = Addr;
1891
}
1892
1893
SAddr = SelectSAddrFI(CurDAG, SAddr);
1894
1895
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1896
1897
if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1898
SIInstrFlags::FlatScratch)) {
1899
int64_t SplitImmOffset, RemainderOffset;
1900
std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1901
COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1902
1903
COffsetVal = SplitImmOffset;
1904
1905
SDValue AddOffset =
1906
SAddr.getOpcode() == ISD::TargetFrameIndex
1907
? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1908
: CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1909
SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1910
SAddr, AddOffset),
1911
0);
1912
}
1913
1914
Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32);
1915
1916
return true;
1917
}
1918
1919
// Check whether the flat scratch SVS swizzle bug affects this access.
1920
bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1921
SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1922
if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1923
return false;
1924
1925
// The bug affects the swizzling of SVS accesses if there is any carry out
1926
// from the two low order bits (i.e. from bit 1 into bit 2) when adding
1927
// voffset to (soffset + inst_offset).
1928
KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1929
KnownBits SKnown = KnownBits::computeForAddSub(
1930
/*Add=*/true, /*NSW=*/false, /*NUW=*/false,
1931
CurDAG->computeKnownBits(SAddr),
1932
KnownBits::makeConstant(APInt(32, ImmOffset)));
1933
uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1934
uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1935
return (VMax & 3) + (SMax & 3) >= 4;
1936
}
1937
1938
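// Match a flat scratch address as (VGPR offset) + (SGPR base) + immediate
// offset for the SVS addressing mode, rejecting cases affected by the SVS
// swizzle bug.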
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1939
SDValue &VAddr, SDValue &SAddr,
1940
SDValue &Offset) const {
1941
int64_t ImmOffset = 0;
1942
1943
SDValue LHS, RHS;
1944
SDValue OrigAddr = Addr;
1945
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1946
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1947
const SIInstrInfo *TII = Subtarget->getInstrInfo();
1948
1949
if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1950
Addr = LHS;
1951
ImmOffset = COffsetVal;
1952
} else if (!LHS->isDivergent() && COffsetVal > 0) {
1953
SDLoc SL(N);
1954
// saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1955
// (large_offset & MaxOffset);
1956
int64_t SplitImmOffset, RemainderOffset;
1957
std::tie(SplitImmOffset, RemainderOffset)
1958
= TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1959
1960
if (isUInt<32>(RemainderOffset)) {
1961
SDNode *VMov = CurDAG->getMachineNode(
1962
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1963
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1964
VAddr = SDValue(VMov, 0);
1965
SAddr = LHS;
1966
if (!isFlatScratchBaseLegal(Addr))
1967
return false;
1968
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1969
return false;
1970
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1971
return true;
1972
}
1973
}
1974
}
1975
1976
if (Addr.getOpcode() != ISD::ADD)
1977
return false;
1978
1979
LHS = Addr.getOperand(0);
1980
RHS = Addr.getOperand(1);
1981
1982
if (!LHS->isDivergent() && RHS->isDivergent()) {
1983
SAddr = LHS;
1984
VAddr = RHS;
1985
} else if (!RHS->isDivergent() && LHS->isDivergent()) {
1986
SAddr = RHS;
1987
VAddr = LHS;
1988
} else {
1989
return false;
1990
}
1991
1992
if (OrigAddr != Addr) {
1993
if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1994
return false;
1995
} else {
1996
if (!isFlatScratchBaseLegalSV(OrigAddr))
1997
return false;
1998
}
1999
2000
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2001
return false;
2002
SAddr = SelectSAddrFI(CurDAG, SAddr);
2003
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2004
return true;
2005
}
2006
2007
// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2008
// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2009
// Handle the case where the Immediate Offset + SOffset is negative.
2010
bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2011
bool Imm32Only,
2012
bool IsBuffer,
2013
int64_t ImmOffset) const {
2014
if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2015
AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2016
KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2017
if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2018
return false;
2019
}
2020
2021
return true;
2022
}
2023
2024
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2025
// not null) offset. If Imm32Only is true, match only 32-bit immediate
2026
// offsets available on CI.
2027
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2028
SDValue *SOffset, SDValue *Offset,
2029
bool Imm32Only, bool IsBuffer,
2030
bool HasSOffset,
2031
int64_t ImmOffset) const {
2032
assert((!SOffset || !Offset) &&
2033
"Cannot match both soffset and offset at the same time!");
2034
2035
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2036
if (!C) {
2037
if (!SOffset)
2038
return false;
2039
2040
if (ByteOffsetNode.getValueType().isScalarInteger() &&
2041
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2042
*SOffset = ByteOffsetNode;
2043
return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2044
ImmOffset);
2045
}
2046
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2047
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2048
*SOffset = ByteOffsetNode.getOperand(0);
2049
return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2050
ImmOffset);
2051
}
2052
}
2053
return false;
2054
}
2055
2056
SDLoc SL(ByteOffsetNode);
2057
2058
// GFX9 and GFX10 have signed byte immediate offsets. The immediate
2059
// offset for S_BUFFER instructions is unsigned.
2060
int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2061
std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2062
*Subtarget, ByteOffset, IsBuffer, HasSOffset);
2063
if (EncodedOffset && Offset && !Imm32Only) {
2064
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2065
return true;
2066
}
2067
2068
// SGPR and literal offsets are unsigned.
2069
if (ByteOffset < 0)
2070
return false;
2071
2072
EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2073
if (EncodedOffset && Offset && Imm32Only) {
2074
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2075
return true;
2076
}
2077
2078
if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2079
return false;
2080
2081
if (SOffset) {
2082
SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2083
*SOffset = SDValue(
2084
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2085
return true;
2086
}
2087
2088
return false;
2089
}
2090
2091
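// Widen a 32-bit SMEM address to 64 bits by pairing it with the function's
// known high address bits in a REG_SEQUENCE.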
SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2092
if (Addr.getValueType() != MVT::i32)
2093
return Addr;
2094
2095
// Zero-extend a 32-bit address.
2096
SDLoc SL(Addr);
2097
2098
const MachineFunction &MF = CurDAG->getMachineFunction();
2099
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2100
unsigned AddrHiVal = Info->get32BitAddressHighBits();
2101
SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2102
2103
const SDValue Ops[] = {
2104
CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2105
Addr,
2106
CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2107
SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2108
0),
2109
CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2110
};
2111
2112
return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2113
Ops), 0);
2114
}
2115
2116
// Match a base and an immediate (if Offset is not null) or an SGPR (if
2117
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2118
// true, match only 32-bit immediate offsets available on CI.
2119
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2120
SDValue *SOffset, SDValue *Offset,
2121
bool Imm32Only, bool IsBuffer,
2122
bool HasSOffset,
2123
int64_t ImmOffset) const {
2124
if (SOffset && Offset) {
2125
assert(!Imm32Only && !IsBuffer);
2126
SDValue B;
2127
2128
if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2129
return false;
2130
2131
int64_t ImmOff = 0;
2132
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2133
ImmOff = C->getSExtValue();
2134
2135
return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2136
ImmOff);
2137
}
2138
2139
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
2140
// wraparound, because s_load instructions perform the addition in 64 bits.
2141
if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2142
!Addr->getFlags().hasNoUnsignedWrap())
2143
return false;
2144
2145
SDValue N0, N1;
2146
// Extract the base and offset if possible.
2147
if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2148
N0 = Addr.getOperand(0);
2149
N1 = Addr.getOperand(1);
2150
} else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2151
assert(N0 && N1 && isa<ConstantSDNode>(N1));
2152
}
2153
if (!N0 || !N1)
2154
return false;
2155
2156
if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2157
ImmOffset)) {
2158
SBase = N0;
2159
return true;
2160
}
2161
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2162
ImmOffset)) {
2163
SBase = N1;
2164
return true;
2165
}
2166
return false;
2167
}
2168
2169
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2170
SDValue *SOffset, SDValue *Offset,
2171
bool Imm32Only) const {
2172
if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2173
SBase = Expand32BitAddress(SBase);
2174
return true;
2175
}
2176
2177
if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2178
SBase = Expand32BitAddress(Addr);
2179
*Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2180
return true;
2181
}
2182
2183
return false;
2184
}
2185
2186
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2187
SDValue &Offset) const {
2188
return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2189
}
2190
2191
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2192
SDValue &Offset) const {
2193
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2194
return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2195
/* Imm32Only */ true);
2196
}
2197
2198
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2199
SDValue &SOffset) const {
2200
return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2201
}
2202
2203
bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2204
SDValue &SOffset,
2205
SDValue &Offset) const {
2206
return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2207
}
2208
2209
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2210
return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2211
/* Imm32Only */ false, /* IsBuffer */ true);
2212
}
2213
2214
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2215
SDValue &Offset) const {
2216
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2217
return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2218
/* Imm32Only */ true, /* IsBuffer */ true);
2219
}
2220
2221
bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2222
SDValue &Offset) const {
2223
// Match the (soffset + offset) pair as a 32-bit register base and
2224
// an immediate offset.
2225
return N.getValueType() == MVT::i32 &&
2226
SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2227
&Offset, /* Imm32Only */ false,
2228
/* IsBuffer */ true);
2229
}
2230
2231
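// Split an index for indirect (MOVREL) addressing into a base and a constant
// offset, peeling off the offset only when doing so cannot make the base
// negative.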
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2232
SDValue &Base,
2233
SDValue &Offset) const {
2234
SDLoc DL(Index);
2235
2236
if (CurDAG->isBaseWithConstantOffset(Index)) {
2237
SDValue N0 = Index.getOperand(0);
2238
SDValue N1 = Index.getOperand(1);
2239
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2240
2241
// (add n0, c0)
2242
// Don't peel off the offset (c0) if doing so could possibly lead
2243
// the base (n0) to be negative.
2244
// (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2245
if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2246
(Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2247
Base = N0;
2248
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2249
return true;
2250
}
2251
}
2252
2253
if (isa<ConstantSDNode>(Index))
2254
return false;
2255
2256
Base = Index;
2257
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2258
return true;
2259
}
2260
2261
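// Create a 32-bit bitfield-extract node: V_BFE for divergent inputs,
// otherwise S_BFE with the offset and width packed into a single operand.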
SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2262
SDValue Val, uint32_t Offset,
2263
uint32_t Width) {
2264
if (Val->isDivergent()) {
2265
unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2266
SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2267
SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2268
2269
return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2270
}
2271
unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2272
// Transformation function, pack the offset and width of a BFE into
2273
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2274
// source, bits [5:0] contain the offset and bits [22:16] the width.
2275
uint32_t PackedVal = Offset | (Width << 16);
2276
SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2277
2278
return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2279
}
2280
2281
void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2282
// "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2283
// "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2284
// Predicate: 0 < b <= c < 32
2285
2286
const SDValue &Shl = N->getOperand(0);
2287
ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2288
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2289
2290
if (B && C) {
2291
uint32_t BVal = B->getZExtValue();
2292
uint32_t CVal = C->getZExtValue();
2293
2294
if (0 < BVal && BVal <= CVal && CVal < 32) {
2295
bool Signed = N->getOpcode() == ISD::SRA;
2296
ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2297
32 - CVal));
2298
return;
2299
}
2300
}
2301
SelectCode(N);
2302
}
2303
2304
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2305
switch (N->getOpcode()) {
2306
case ISD::AND:
2307
if (N->getOperand(0).getOpcode() == ISD::SRL) {
2308
// "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2309
// Predicate: isMask(mask)
2310
const SDValue &Srl = N->getOperand(0);
2311
ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2312
ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2313
2314
if (Shift && Mask) {
2315
uint32_t ShiftVal = Shift->getZExtValue();
2316
uint32_t MaskVal = Mask->getZExtValue();
2317
2318
if (isMask_32(MaskVal)) {
2319
uint32_t WidthVal = llvm::popcount(MaskVal);
2320
ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2321
WidthVal));
2322
return;
2323
}
2324
}
2325
}
2326
break;
2327
case ISD::SRL:
2328
if (N->getOperand(0).getOpcode() == ISD::AND) {
2329
// "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2330
// Predicate: isMask(mask >> b)
2331
const SDValue &And = N->getOperand(0);
2332
ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2333
ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2334
2335
if (Shift && Mask) {
2336
uint32_t ShiftVal = Shift->getZExtValue();
2337
uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2338
2339
if (isMask_32(MaskVal)) {
2340
uint32_t WidthVal = llvm::popcount(MaskVal);
2341
ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2342
WidthVal));
2343
return;
2344
}
2345
}
2346
} else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2347
SelectS_BFEFromShifts(N);
2348
return;
2349
}
2350
break;
2351
case ISD::SRA:
2352
if (N->getOperand(0).getOpcode() == ISD::SHL) {
2353
SelectS_BFEFromShifts(N);
2354
return;
2355
}
2356
break;
2357
2358
case ISD::SIGN_EXTEND_INREG: {
2359
// sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2360
SDValue Src = N->getOperand(0);
2361
if (Src.getOpcode() != ISD::SRL)
2362
break;
2363
2364
const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2365
if (!Amt)
2366
break;
2367
2368
unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2369
ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2370
Amt->getZExtValue(), Width));
2371
return;
2372
}
2373
}
2374
2375
SelectCode(N);
2376
}
2377
2378
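// Return true if this BRCOND can use a scalar compare and branch on SCC: the
// condition must be a single-use SETCC (possibly behind a CopyToReg) of i32,
// or of i64 with an eq/ne predicate on subtargets that have 64-bit scalar
// compares.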
bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2379
assert(N->getOpcode() == ISD::BRCOND);
2380
if (!N->hasOneUse())
2381
return false;
2382
2383
SDValue Cond = N->getOperand(1);
2384
if (Cond.getOpcode() == ISD::CopyToReg)
2385
Cond = Cond.getOperand(2);
2386
2387
if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2388
return false;
2389
2390
MVT VT = Cond.getOperand(0).getSimpleValueType();
2391
if (VT == MVT::i32)
2392
return true;
2393
2394
if (VT == MVT::i64) {
2395
auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2396
2397
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2398
return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2399
}
2400
2401
return false;
2402
}
2403
2404
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2405
assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2406
// Special case for amdgcn.ballot:
2407
// %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2408
// %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2409
// =>
2410
// Use i1 %Cond value instead of i(WaveSize) %VCMP.
2411
// This is possible because divergent ISD::SETCC is selected as V_CMP and
2412
// Cond becomes a i(WaveSize) full mask value.
2413
// Note that ballot doesn't use the SETEQ condition, but it's easy to support it
2414
// here for completeness, so in this case Negate is set true on return.
2415
auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2416
if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2417
isNullConstant(VCMP.getOperand(1))) {
2418
2419
auto Cond = VCMP.getOperand(0);
2420
if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2421
Cond = Cond.getOperand(0);
2422
2423
if (isBoolSGPR(Cond)) {
2424
Negate = VCMP_CC == ISD::SETEQ;
2425
return Cond;
2426
}
2427
}
2428
return SDValue();
2429
}
2430
2431
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2432
SDValue Cond = N->getOperand(1);
2433
2434
if (Cond.isUndef()) {
2435
CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2436
N->getOperand(2), N->getOperand(0));
2437
return;
2438
}
2439
2440
const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2441
const SIRegisterInfo *TRI = ST->getRegisterInfo();
2442
2443
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2444
bool AndExec = !UseSCCBr;
2445
bool Negate = false;
2446
2447
if (Cond.getOpcode() == ISD::SETCC &&
2448
Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2449
SDValue VCMP = Cond->getOperand(0);
2450
auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2451
if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2452
isNullConstant(Cond->getOperand(1)) &&
2453
// We may encounter ballot.i64 in wave32 mode on -O0.
2454
VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2455
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2456
// %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2457
// BRCOND i1 %C, %BB
2458
// =>
2459
// %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2460
// VCC = COPY i(WaveSize) %VCMP
2461
// S_CBRANCH_VCCNZ/VCCZ %BB
2462
Negate = CC == ISD::SETEQ;
2463
bool NegatedBallot = false;
2464
if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2465
Cond = BallotCond;
2466
UseSCCBr = !BallotCond->isDivergent();
2467
Negate = Negate ^ NegatedBallot;
2468
} else {
2469
// TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2470
// selected as V_CMP, but this may change for a uniform condition.
2471
Cond = VCMP;
2472
UseSCCBr = false;
2473
}
2474
}
2475
// Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2476
// V_CMPs resulting from a ballot, or a ballot with a uniform condition where SCC is
2477
// used.
2478
AndExec = false;
2479
}
2480
2481
unsigned BrOp =
2482
UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2483
: (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2484
Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2485
SDLoc SL(N);
2486
2487
if (AndExec) {
2488
// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2489
// analyzed what generates the vcc value, so we do not know whether vcc
2490
// bits for disabled lanes are 0. Thus we need to mask out bits for
2491
// disabled lanes.
2492
//
2493
// For the case that we select S_CBRANCH_SCC1 and it gets
2494
// changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2495
// SIInstrInfo::moveToVALU, which inserts the S_AND.
2496
//
2497
// We could add an analysis of what generates the vcc value here and omit
2498
// the S_AND when it is unnecessary. But it would be better to add a separate
2499
// pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2500
// catches both cases.
2501
Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2502
: AMDGPU::S_AND_B64,
2503
SL, MVT::i1,
2504
CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2505
: AMDGPU::EXEC,
2506
MVT::i1),
2507
Cond),
2508
0);
2509
}
2510
2511
SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2512
CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2513
N->getOperand(2), // Basic Block
2514
VCC.getValue(0));
2515
}
2516
2517
void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2518
if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2519
!N->isDivergent()) {
2520
SDValue Src = N->getOperand(0);
2521
if (Src.getValueType() == MVT::f16) {
2522
if (isExtractHiElt(Src, Src)) {
2523
CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2524
{Src});
2525
return;
2526
}
2527
}
2528
}
2529
2530
SelectCode(N);
2531
}
2532
2533
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2534
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
2535
// be copied to an SGPR with readfirstlane.
2536
unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2537
AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2538
2539
SDValue Chain = N->getOperand(0);
2540
SDValue Ptr = N->getOperand(2);
2541
MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2542
MachineMemOperand *MMO = M->getMemOperand();
2543
bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2544
2545
SDValue Offset;
2546
if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2547
SDValue PtrBase = Ptr.getOperand(0);
2548
SDValue PtrOffset = Ptr.getOperand(1);
2549
2550
const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2551
if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2552
N = glueCopyToM0(N, PtrBase);
2553
Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2554
}
2555
}
2556
2557
if (!Offset) {
2558
N = glueCopyToM0(N, Ptr);
2559
Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2560
}
2561
2562
SDValue Ops[] = {
2563
Offset,
2564
CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2565
Chain,
2566
N->getOperand(N->getNumOperands() - 1) // New glue
2567
};
2568
2569
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2570
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2571
}
2572
2573
// We need to handle this here because tablegen doesn't support matching
2574
// instructions with multiple outputs.
2575
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2576
unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2577
SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2578
N->getOperand(5), N->getOperand(0)};
2579
2580
MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2581
MachineMemOperand *MMO = M->getMemOperand();
2582
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2583
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2584
}
2585
2586
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2587
switch (IntrID) {
2588
case Intrinsic::amdgcn_ds_gws_init:
2589
return AMDGPU::DS_GWS_INIT;
2590
case Intrinsic::amdgcn_ds_gws_barrier:
2591
return AMDGPU::DS_GWS_BARRIER;
2592
case Intrinsic::amdgcn_ds_gws_sema_v:
2593
return AMDGPU::DS_GWS_SEMA_V;
2594
case Intrinsic::amdgcn_ds_gws_sema_br:
2595
return AMDGPU::DS_GWS_SEMA_BR;
2596
case Intrinsic::amdgcn_ds_gws_sema_p:
2597
return AMDGPU::DS_GWS_SEMA_P;
2598
case Intrinsic::amdgcn_ds_gws_sema_release_all:
2599
return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2600
default:
2601
llvm_unreachable("not a gws intrinsic");
2602
}
2603
}
2604
2605
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2606
if (!Subtarget->hasGWS() ||
2607
(IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2608
!Subtarget->hasGWSSemaReleaseAll())) {
2609
// Let this error.
2610
SelectCode(N);
2611
return;
2612
}
2613
2614
// Chain, intrinsic ID, vsrc, offset
2615
const bool HasVSrc = N->getNumOperands() == 4;
2616
assert(HasVSrc || N->getNumOperands() == 3);
2617
2618
SDLoc SL(N);
2619
SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2620
int ImmOffset = 0;
2621
MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2622
MachineMemOperand *MMO = M->getMemOperand();
2623
2624
// Don't worry if the offset ends up in a VGPR. Only one lane will have
2625
// an effect, so SIFixSGPRCopies will validly insert readfirstlane.
2626
2627
// The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2628
// offset field) % 64. Some versions of the programming guide omit the m0
2629
// part, or claim it's from offset 0.
2630
if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2631
// If we have a constant offset, try to use the 0 in m0 as the base.
2632
// TODO: Look into changing the default m0 initialization value. If the
2633
// default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2634
// the immediate offset.
2635
glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2636
ImmOffset = ConstOffset->getZExtValue();
2637
} else {
2638
if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2639
ImmOffset = BaseOffset.getConstantOperandVal(1);
2640
BaseOffset = BaseOffset.getOperand(0);
2641
}
2642
2643
// Prefer to do the shift in an SGPR since it should be possible to use m0
2644
// as the result directly. If it's already an SGPR, it will be eliminated
2645
// later.
2646
SDNode *SGPROffset
2647
= CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2648
BaseOffset);
2649
// Shift to offset in m0
2650
SDNode *M0Base
2651
= CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2652
SDValue(SGPROffset, 0),
2653
CurDAG->getTargetConstant(16, SL, MVT::i32));
2654
glueCopyToM0(N, SDValue(M0Base, 0));
2655
}
2656
2657
SDValue Chain = N->getOperand(0);
2658
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2659
2660
const unsigned Opc = gwsIntrinToOpcode(IntrID);
2661
SmallVector<SDValue, 5> Ops;
2662
if (HasVSrc)
2663
Ops.push_back(N->getOperand(2));
2664
Ops.push_back(OffsetField);
2665
Ops.push_back(Chain);
2666
2667
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2668
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2669
}
2670
2671
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2672
if (Subtarget->getLDSBankCount() != 16) {
2673
// This is a single instruction with a pattern.
2674
SelectCode(N);
2675
return;
2676
}
2677
2678
SDLoc DL(N);
2679
2680
// This requires 2 instructions. It is possible to write a pattern to support
2681
// this, but the generated isel emitter doesn't correctly deal with multiple
2682
// output instructions using the same physical register input. The copy to m0
2683
// is incorrectly placed before the second instruction.
2684
//
2685
// TODO: Match source modifiers.
2686
//
2687
// def : Pat <
2688
// (int_amdgcn_interp_p1_f16
2689
// (VOP3Mods f32:$src0, i32:$src0_modifiers),
2690
// (i32 timm:$attrchan), (i32 timm:$attr),
2691
// (i1 timm:$high), M0),
2692
// (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2693
// timm:$attrchan, 0,
2694
// (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2695
// let Predicates = [has16BankLDS];
2696
// }
2697
2698
// 16 bank LDS
2699
SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2700
N->getOperand(5), SDValue());
2701
2702
SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2703
2704
SDNode *InterpMov =
2705
CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2706
CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2707
N->getOperand(3), // Attr
2708
N->getOperand(2), // Attrchan
2709
ToM0.getValue(1) // In glue
2710
});
2711
2712
SDNode *InterpP1LV =
2713
CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2714
CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2715
N->getOperand(1), // Src0
2716
N->getOperand(3), // Attr
2717
N->getOperand(2), // Attrchan
2718
CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2719
SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2720
N->getOperand(4), // high
2721
CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2722
CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2723
SDValue(InterpMov, 1)
2724
});
2725
2726
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2727
}
2728
2729
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2730
unsigned IntrID = N->getConstantOperandVal(1);
2731
switch (IntrID) {
2732
case Intrinsic::amdgcn_ds_append:
2733
case Intrinsic::amdgcn_ds_consume: {
2734
if (N->getValueType(0) != MVT::i32)
2735
break;
2736
SelectDSAppendConsume(N, IntrID);
2737
return;
2738
}
2739
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2740
SelectDSBvhStackIntrinsic(N);
2741
return;
2742
}
2743
2744
SelectCode(N);
2745
}
2746
2747
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2748
unsigned IntrID = N->getConstantOperandVal(0);
2749
unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2750
SDNode *ConvGlueNode = N->getGluedNode();
2751
if (ConvGlueNode) {
2752
// FIXME: Possibly iterate over multiple glue nodes?
2753
assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2754
ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2755
ConvGlueNode =
2756
CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2757
MVT::Glue, SDValue(ConvGlueNode, 0));
2758
} else {
2759
ConvGlueNode = nullptr;
2760
}
2761
switch (IntrID) {
2762
case Intrinsic::amdgcn_wqm:
2763
Opcode = AMDGPU::WQM;
2764
break;
2765
case Intrinsic::amdgcn_softwqm:
2766
Opcode = AMDGPU::SOFT_WQM;
2767
break;
2768
case Intrinsic::amdgcn_wwm:
2769
case Intrinsic::amdgcn_strict_wwm:
2770
Opcode = AMDGPU::STRICT_WWM;
2771
break;
2772
case Intrinsic::amdgcn_strict_wqm:
2773
Opcode = AMDGPU::STRICT_WQM;
2774
break;
2775
case Intrinsic::amdgcn_interp_p1_f16:
2776
SelectInterpP1F16(N);
2777
return;
2778
default:
2779
SelectCode(N);
2780
break;
2781
}
2782
2783
if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2784
SDValue Src = N->getOperand(1);
2785
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2786
}
2787
2788
if (ConvGlueNode) {
2789
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2790
NewOps.push_back(SDValue(ConvGlueNode, 0));
2791
CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2792
}
2793
}
2794
2795
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2796
unsigned IntrID = N->getConstantOperandVal(1);
2797
switch (IntrID) {
2798
case Intrinsic::amdgcn_ds_gws_init:
2799
case Intrinsic::amdgcn_ds_gws_barrier:
2800
case Intrinsic::amdgcn_ds_gws_sema_v:
2801
case Intrinsic::amdgcn_ds_gws_sema_br:
2802
case Intrinsic::amdgcn_ds_gws_sema_p:
2803
case Intrinsic::amdgcn_ds_gws_sema_release_all:
2804
SelectDS_GWS(N, IntrID);
2805
return;
2806
default:
2807
break;
2808
}
2809
2810
SelectCode(N);
2811
}
2812
2813
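// Lower AMDGPUISD::WAVE_ADDRESS to an S_LSHR_B32 of the operand by
// log2(wavefront size).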
void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2814
SDValue Log2WaveSize =
2815
CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2816
CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2817
{N->getOperand(0), Log2WaveSize});
2818
}
2819
2820
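// Lower STACKRESTORE by copying the value back into the stack pointer. A
// value produced by WAVE_ADDRESS is used directly; anything else is shifted
// left by log2(wavefront size), going through V_READFIRSTLANE_B32 first if
// it is divergent.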
void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2821
SDValue SrcVal = N->getOperand(1);
2822
if (SrcVal.getValueType() != MVT::i32) {
2823
SelectCode(N); // Emit default error
2824
return;
2825
}
2826
2827
SDValue CopyVal;
2828
Register SP = TLI->getStackPointerRegisterToSaveRestore();
2829
SDLoc SL(N);
2830
2831
if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2832
CopyVal = SrcVal.getOperand(0);
2833
} else {
2834
SDValue Log2WaveSize = CurDAG->getTargetConstant(
2835
Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2836
2837
if (N->isDivergent()) {
2838
SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2839
MVT::i32, SrcVal),
2840
0);
2841
}
2842
2843
CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2844
{SrcVal, Log2WaveSize}),
2845
0);
2846
}
2847
2848
SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2849
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2850
}
2851
2852
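// Strip source modifiers off a VOP3 operand: fneg (or, when canonicalizing,
// an fsub from +/-0) sets the NEG modifier, and fabs sets ABS when allowed.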
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2853
unsigned &Mods,
2854
bool IsCanonicalizing,
2855
bool AllowAbs) const {
2856
Mods = SISrcMods::NONE;
2857
Src = In;
2858
2859
if (Src.getOpcode() == ISD::FNEG) {
2860
Mods |= SISrcMods::NEG;
2861
Src = Src.getOperand(0);
2862
} else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2863
// Fold fsub [+-]0 into fneg. This may not have folded depending on the
2864
// denormal mode, but we're implicitly canonicalizing in a source operand.
2865
auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2866
if (LHS && LHS->isZero()) {
2867
Mods |= SISrcMods::NEG;
2868
Src = Src.getOperand(1);
2869
}
2870
}
2871
2872
if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2873
Mods |= SISrcMods::ABS;
2874
Src = Src.getOperand(0);
2875
}
2876
2877
return true;
2878
}
2879
2880
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2881
SDValue &SrcMods) const {
2882
unsigned Mods;
2883
if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2884
/*AllowAbs=*/true)) {
2885
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2886
return true;
2887
}
2888
2889
return false;
2890
}
2891
2892
bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2893
SDValue In, SDValue &Src, SDValue &SrcMods) const {
2894
unsigned Mods;
2895
if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2896
/*AllowAbs=*/true)) {
2897
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2898
return true;
2899
}
2900
2901
return false;
2902
}
2903
2904
bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2905
SDValue &SrcMods) const {
2906
unsigned Mods;
2907
if (SelectVOP3ModsImpl(In, Src, Mods,
2908
/*IsCanonicalizing=*/true,
2909
/*AllowAbs=*/false)) {
2910
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2911
return true;
2912
}
2913
2914
return false;
2915
}
2916
2917
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2918
if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2919
return false;
2920
2921
Src = In;
2922
return true;
2923
}
2924
2925
bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2926
SDValue &SrcMods,
2927
bool OpSel) const {
2928
unsigned Mods;
2929
if (SelectVOP3ModsImpl(In, Src, Mods,
2930
/*IsCanonicalizing=*/true,
2931
/*AllowAbs=*/false)) {
2932
if (OpSel)
2933
Mods |= SISrcMods::OP_SEL_0;
2934
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2935
return true;
2936
}
2937
2938
return false;
2939
}
2940
2941
bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2942
SDValue &SrcMods) const {
2943
return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2944
}
2945
2946
bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2947
SDValue &SrcMods) const {
2948
return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2949
}
2950
2951
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2952
SDValue &SrcMods, SDValue &Clamp,
2953
SDValue &Omod) const {
2954
SDLoc DL(In);
2955
Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2956
Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2957
2958
return SelectVOP3Mods(In, Src, SrcMods);
2959
}
2960
2961
bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2962
SDValue &SrcMods, SDValue &Clamp,
2963
SDValue &Omod) const {
2964
SDLoc DL(In);
2965
Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2966
Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2967
2968
return SelectVOP3BMods(In, Src, SrcMods);
2969
}
2970
2971
bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2972
SDValue &Clamp, SDValue &Omod) const {
2973
Src = In;
2974
2975
SDLoc DL(In);
2976
Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2977
Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2978
2979
return true;
2980
}
2981
2982
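// Match source modifiers for a packed (VOP3P) operand. An fneg of the whole
// vector toggles NEG and NEG_HI; for a two-element build_vector, per-element
// fneg and high/low extracts are folded into the NEG/NEG_HI and OP_SEL bits.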
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2983
SDValue &SrcMods, bool IsDOT) const {
2984
unsigned Mods = SISrcMods::NONE;
2985
Src = In;
2986
2987
// TODO: Handle G_FSUB 0 as fneg
2988
if (Src.getOpcode() == ISD::FNEG) {
2989
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2990
Src = Src.getOperand(0);
2991
}
2992
2993
if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2994
(!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2995
unsigned VecMods = Mods;
2996
2997
SDValue Lo = stripBitcast(Src.getOperand(0));
2998
SDValue Hi = stripBitcast(Src.getOperand(1));
2999
3000
if (Lo.getOpcode() == ISD::FNEG) {
3001
Lo = stripBitcast(Lo.getOperand(0));
3002
Mods ^= SISrcMods::NEG;
3003
}
3004
3005
if (Hi.getOpcode() == ISD::FNEG) {
3006
Hi = stripBitcast(Hi.getOperand(0));
3007
Mods ^= SISrcMods::NEG_HI;
3008
}
3009
3010
if (isExtractHiElt(Lo, Lo))
3011
Mods |= SISrcMods::OP_SEL_0;
3012
3013
if (isExtractHiElt(Hi, Hi))
3014
Mods |= SISrcMods::OP_SEL_1;
3015
3016
unsigned VecSize = Src.getValueSizeInBits();
3017
Lo = stripExtractLoElt(Lo);
3018
Hi = stripExtractLoElt(Hi);
3019
3020
if (Lo.getValueSizeInBits() > VecSize) {
3021
Lo = CurDAG->getTargetExtractSubreg(
3022
(VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3023
MVT::getIntegerVT(VecSize), Lo);
3024
}
3025
3026
if (Hi.getValueSizeInBits() > VecSize) {
3027
Hi = CurDAG->getTargetExtractSubreg(
3028
(VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3029
MVT::getIntegerVT(VecSize), Hi);
3030
}
3031
3032
assert(Lo.getValueSizeInBits() <= VecSize &&
3033
Hi.getValueSizeInBits() <= VecSize);
3034
3035
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3036
// Really a scalar input. Just select from the low half of the register to
3037
// avoid packing.
3038
3039
if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3040
Src = Lo;
3041
} else {
3042
assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3043
3044
SDLoc SL(In);
3045
SDValue Undef = SDValue(
3046
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3047
Lo.getValueType()), 0);
3048
auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3049
: AMDGPU::SReg_64RegClassID;
3050
const SDValue Ops[] = {
3051
CurDAG->getTargetConstant(RC, SL, MVT::i32),
3052
Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3053
Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3054
3055
Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3056
Src.getValueType(), Ops), 0);
3057
}
3058
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3059
return true;
3060
}
3061
3062
if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3063
uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3064
.bitcastToAPInt().getZExtValue();
3065
if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3066
Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3067
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3068
return true;
3069
}
3070
}
3071
3072
Mods = VecMods;
3073
}
3074
3075
// Packed instructions do not have abs modifiers.
3076
Mods |= SISrcMods::OP_SEL_1;
3077
3078
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3079
return true;
3080
}
3081
3082
bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3083
SDValue &SrcMods) const {
3084
return SelectVOP3PMods(In, Src, SrcMods, true);
3085
}
3086
3087
bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3088
const ConstantSDNode *C = cast<ConstantSDNode>(In);
3089
// A literal i1 value set in the intrinsic; it represents SrcMods for the next operand.
3090
// 1 promotes packed values to signed, 0 treats them as unsigned.
3091
assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3092
3093
unsigned Mods = SISrcMods::OP_SEL_1;
3094
unsigned SrcSign = C->getZExtValue();
3095
if (SrcSign == 1)
3096
Mods ^= SISrcMods::NEG;
3097
3098
Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3099
return true;
3100
}
3101
3102
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3103
SDValue &Src) const {
3104
const ConstantSDNode *C = cast<ConstantSDNode>(In);
3105
assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3106
3107
unsigned Mods = SISrcMods::OP_SEL_1;
3108
unsigned SrcVal = C->getZExtValue();
3109
if (SrcVal == 1)
3110
Mods |= SISrcMods::OP_SEL_0;
3111
3112
Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3113
return true;
3114
}
3115
3116
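// Build a REG_SEQUENCE from 2, 4 or 8 32-bit elements, choosing the matching
// VReg_64/VReg_128/VReg_256 register class.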
static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3117
llvm::SelectionDAG *CurDAG,
3118
const SDLoc &DL) {
3119
unsigned DstRegClass;
3120
EVT DstTy;
3121
switch (Elts.size()) {
3122
case 8:
3123
DstRegClass = AMDGPU::VReg_256RegClassID;
3124
DstTy = MVT::v8i32;
3125
break;
3126
case 4:
3127
DstRegClass = AMDGPU::VReg_128RegClassID;
3128
DstTy = MVT::v4i32;
3129
break;
3130
case 2:
3131
DstRegClass = AMDGPU::VReg_64RegClassID;
3132
DstTy = MVT::v2i32;
3133
break;
3134
default:
3135
llvm_unreachable("unhandled Reg sequence size");
3136
}
3137
3138
SmallVector<SDValue, 17> Ops;
3139
Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3140
for (unsigned i = 0; i < Elts.size(); ++i) {
3141
Ops.push_back(Elts[i]);
3142
Ops.push_back(CurDAG->getTargetConstant(
3143
SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3144
}
3145
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3146
}
3147
3148
static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3149
llvm::SelectionDAG *CurDAG,
3150
const SDLoc &DL) {
3151
SmallVector<SDValue, 8> PackedElts;
3152
assert("unhandled Reg sequence size" &&
3153
"unhandled Reg sequence size");
3154
3155
// Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3156
// unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
3157
for (unsigned i = 0; i < Elts.size(); i += 2) {
3158
SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3159
SDValue HiSrc;
3160
if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3161
PackedElts.push_back(HiSrc);
3162
} else {
3163
SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3164
MachineSDNode *Packed =
3165
CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3166
{Elts[i + 1], Elts[i], PackLoLo});
3167
PackedElts.push_back(SDValue(Packed, 0));
3168
}
3169
}
3170
3171
return buildRegSequence32(PackedElts, CurDAG, DL);
3172
}
3173
3174
static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3175
llvm::SelectionDAG *CurDAG,
3176
const SDLoc &DL, unsigned ElementSize) {
3177
if (ElementSize == 16)
3178
return buildRegSequence16(Elts, CurDAG, DL);
3179
if (ElementSize == 32)
3180
return buildRegSequence32(Elts, CurDAG, DL);
3181
llvm_unreachable("Unhandled element size");
3182
}
3183
3184
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3185
SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3186
llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3187
unsigned ElementSize) {
3188
if (ModOpcode == ISD::FNEG) {
3189
Mods |= SISrcMods::NEG;
3190
// Check if all elements also have abs modifier
3191
SmallVector<SDValue, 8> NegAbsElts;
3192
for (auto El : Elts) {
3193
if (El.getOpcode() != ISD::FABS)
3194
break;
3195
NegAbsElts.push_back(El->getOperand(0));
3196
}
3197
if (Elts.size() != NegAbsElts.size()) {
3198
// Neg
3199
Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3200
} else {
3201
// Neg and Abs
3202
Mods |= SISrcMods::NEG_HI;
3203
Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3204
}
3205
} else {
3206
assert(ModOpcode == ISD::FABS);
3207
// Abs
3208
Mods |= SISrcMods::NEG_HI;
3209
Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3210
}
3211
}
3212
3213
// Check all f16 elements for modifiers while looking through b32 and v2b16
3214
// build vectors; stop if an element does not satisfy ModifierCheck.
3215
static void
3216
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3217
std::function<bool(SDValue)> ModifierCheck) {
3218
for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3219
if (auto *F16Pair =
3220
dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3221
for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3222
SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3223
if (!ModifierCheck(ElF16))
3224
break;
3225
}
3226
}
3227
}
3228
}
3229
3230
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Element.getOperand(0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Only the neg modifier is matched here.
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(ElV2f16.getOperand(0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

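// Select the source and source modifiers for an f16 WMMA operand where neg
// and abs modifiers are allowed. The modifier found on the first element
// (FNEG or FABS) must be present on every element for it to be folded.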
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
      // Based on first element decide which mod we match, neg or abs
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(ElF16.getOperand(0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
                           16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2f16->getOperand(0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

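// Select the source and source modifiers for an f32 WMMA operand, folding a
// neg or abs modifier that is present on every element of the build vector.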
bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    assert(BV->getNumOperands() > 0);
    // Based on first element decide which mod we match, neg or abs
    SDValue ElF32 = stripBitcast(BV->getOperand(0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(BV->getOperand(i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(ElF32.getOperand(0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

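// Select an inline-immediate source for a WMMA operand: match a splat build
// vector whose value is an inline constant, either directly or as a 16-bit
// splat seen through a bitcast.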
bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(FloatVal)) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(RawValue.value())) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}

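// Select the index source and index key for SWMMAC with 8-bit indices. A
// shift of the 32-bit index register by a multiple of 8 is folded into the
// index key.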
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
                                            SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() % 8 == 0) {
      Key = ShiftAmt->getZExtValue() / 8;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

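// Select the index source and index key for SWMMAC with 16-bit indices. A
// shift of the 32-bit index register by 16 selects the high half via the
// index key.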
bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() == 16) {
      Key = 1;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates a conversion from fp16.
    // If the source's op_sel is set, it picks the high half of the source
    // register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                  SDValue &SrcMods) const {
  unsigned Mods = 0;
  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
    return false;
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

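// Return a 32-bit value whose high 16 bits hold In: undef, a constant shifted
// into the high half, or the register In was extracted from. Returns an empty
// SDValue if no such form is matched.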
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

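// Return true if the immediate should be materialized in a VGPR: at least one
// of the first few uses requires a VGPR operand and cannot be commuted to
// accept an SGPR.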
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be a register class that
    // needs to be an SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we have not succeeded in
      // commuting the current user, which means at least one use strictly
      // requires a VGPR. Don't attempt to commute the remaining users.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

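// Return true if the load can be selected as a scalar (uniform) load: the
// memory operand must be uniform, sufficiently aligned, and in an address
// space the scalar unit is allowed to read from.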
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  const MachineMemOperand *MMO = Ld->getMemOperand();
  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
    return false;

  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}

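// Run target-specific folds over the selected machine nodes, repeating until
// no more changes are made.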
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}

char AMDGPUDAGToDAGISelLegacy::ID = 0;