Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}
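// For example, both of these forms expose the high half of the same 32-bit
// source V, and reduce to it:
//   (extract_vector_elt (v2i16 (bitcast i32:V)), 1)
//   (trunc (srl i32:V, 16))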
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
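// In practice, fp16SrcZerosHighBits answering true means that on gfx9 and
// earlier an op such as v_add_f16 writes zeros to bits [31:16] of its 32-bit
// destination VGPR, so a later pack of the low half needs no extra masking.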
bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}
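// On GCN the inline constants checked above are the integers -16..64 and a
// small set of floating-point values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0,
// plus 1/(2*pi) on newer targets); anything else must be encoded as a
// literal operand.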
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
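// Schematically, buildSMovImm64 emits:
//   %lo  = S_MOV_B32 (Imm & 0xffffffff)
//   %hi  = S_MOV_B32 (Imm >> 32)
//   %res = REG_SEQUENCE %lo, sub0, %hi, sub1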
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
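// For instance, (build_vector a, b, c, d) destined for a 128-bit register
// class becomes: REG_SEQUENCE RC, a, sub0, b, sub1, c, sub2, d, sub3.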
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }
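  // 64-bit constants that are neither inline immediates nor encodable as a
  // single 32-bit literal are materialized below as two S_MOV_B32s joined by
  // a REG_SEQUENCE.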
  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
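// For example, isUnneededShiftMask returns true for (and x, 31) when
// ShAmtBits == 5 (a 32-bit shift): the mask only keeps bits the shift reads
// anyway, so the AND can be folded away.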
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split the 64-bit `or` earlier, it's a complicated pattern to
    // match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
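  // Opcode lookup, indexed as OpcMap[ConsumesCarryIn][IsDivergent][IsAdd].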
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}
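// In SelectAddcSubb above and SelectUADDO_USUBO below, it is the node's
// divergence, not its IR-level type, that decides whether the carry is kept
// in VCC via V_ADDC/V_SUBB or goes through the scalar carry pseudos that are
// expanded later.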
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
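// SMUL_LOHI / UMUL_LOHI reuse the same MAD opcodes with a zero addend; the
// 64-bit product is then split back into its 32-bit halves with
// EXTRACT_SUBREG.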
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
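// DS instructions take a 16-bit unsigned byte offset, so e.g. offset:65535
// can be folded here while 65536 has to stay in the address computation.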
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
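// For ds_read2_b32 / ds_write2_b32 (Size == 4), isDSOffset2Legal admits byte
// offsets that are multiples of 4 up to 255 * 4 = 1020; the b64 forms
// (Size == 8) allow multiples of 8 up to 2040.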
// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for a flat scratch
// access in the form: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for a flat scratch
// access in the form: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
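      // e.g. with a 12-bit offset field (MaxOffset == 4095), a constant
      // private address of 0x11234 splits into a v_mov_b32 of 0x11000 plus
      // an immediate offset of 0x234.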
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}
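// On subtargets with a restricted SOffset field, a zero byte offset has to be
// encoded as the SGPR_NULL register rather than a literal 0, which is what
// the helper below takes care of.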
bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field
        // and add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
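        // A 32-bit (scratch) address needs a single V_ADD for the remainder;
        // a 64-bit address takes an add/addc pair whose halves are stitched
        // back together with REG_SEQUENCE below.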
        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base
          // address is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

        if (isUInt<32>(RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
          return true;
        }
      }

      // We are adding a 64 bit SGPR and a constant. If the constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it takes fewer
      // instructions to perform VALU adds with immediates or inline literals.
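      //
      // For example (illustrative): if neither 32-bit half of COffsetVal is
      // an inline constant, NumLiterals below is 2; on a subtarget whose
      // constant bus limit exceeds that, we return false and let the plain
      // VALU adds with literals be selected instead.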
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
          !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}

static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for the scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  return SAddr;
}

// Match (32-bit SGPR base) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  if (Addr->isDivergent())
    return false;

  SDLoc DL(Addr);

  int64_t COffsetVal = 0;

  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  } else {
    SAddr = Addr;
  }

  SAddr = SelectSAddrFI(CurDAG, SAddr);

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
                              SIInstrFlags::FlatScratch)) {
    int64_t SplitImmOffset, RemainderOffset;
    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
        COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);

    COffsetVal = SplitImmOffset;
    SDValue AddOffset =
        SAddr.getOpcode() == ISD::TargetFrameIndex
            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
            : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
                                           SAddr, AddOffset),
                    0);
  }

  Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32);

  return true;
}

// Check whether the flat scratch SVS swizzle bug affects this access.
bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
    SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
    return false;

  // The bug affects the swizzling of SVS accesses if there is any carry out
  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
  // voffset to (soffset + inst_offset).
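  // For example, if the known bits allow (VMax & 3) == 3 and (SMax & 3) == 2,
  // then 3 + 2 >= 4 below, meaning a carry from bit 1 into bit 2 is possible
  // and the access is conservatively treated as affected.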
  KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
  KnownBits SKnown = KnownBits::computeForAddSub(
      /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
      CurDAG->computeKnownBits(SAddr),
      KnownBits::makeConstant(APInt(32, ImmOffset)));
  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
  return (VMax & 3) + (SMax & 3) >= 4;
}

bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                             SDValue &VAddr, SDValue &SAddr,
                                             SDValue &Offset) const {
  int64_t ImmOffset = 0;

  SDValue LHS, RHS;
  SDValue OrigAddr = Addr;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(SplitImmOffset, RemainderOffset) =
          TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);

      if (isUInt<32>(RemainderOffset)) {
        SDNode *VMov = CurDAG->getMachineNode(
            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VAddr = SDValue(VMov, 0);
        SAddr = LHS;
        if (!isFlatScratchBaseLegal(Addr))
          return false;
        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
          return false;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
        return true;
      }
    }
  }

  if (Addr.getOpcode() != ISD::ADD)
    return false;

  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  if (!LHS->isDivergent() && RHS->isDivergent()) {
    SAddr = LHS;
    VAddr = RHS;
  } else if (!RHS->isDivergent() && LHS->isDivergent()) {
    SAddr = RHS;
    VAddr = LHS;
  } else {
    return false;
  }

  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
      return false;
  } else {
    if (!isFlatScratchBaseLegalSV(OrigAddr))
      return false;
  }

  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
    return false;
  SAddr = SelectSAddrFI(CurDAG, SAddr);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
  return true;
}

// For unbuffered smem loads, it is illegal for the Immediate Offset to be
// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
// Handle the case where the Immediate Offset + SOffset is negative.
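// For example (illustrative): ImmOffset = -8 with an SOffset whose known
// minimum value is 4 gives -8 + 4 < 0, so the combination is rejected.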
bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
                                                     bool Imm32Only,
                                                     bool IsBuffer,
                                                     int64_t ImmOffset) const {
  if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
      AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
    KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
    if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
      return false;
  }

  return true;
}

// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset,
                                          int64_t ImmOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  if (SOffset) {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}

SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
  if (Addr.getValueType() != MVT::i32)
    return Addr;

  // Zero-extend a 32-bit address.
  SDLoc SL(Addr);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  unsigned AddrHiVal = Info->get32BitAddressHighBits();
  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
      Addr,
      CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
              0),
      CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
  };

  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
                                        Ops), 0);
}

// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
                                              SDValue *SOffset, SDValue *Offset,
                                              bool Imm32Only, bool IsBuffer,
                                              bool HasSOffset,
                                              int64_t ImmOffset) const {
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
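    // Match an immediate offset on Addr first; the base that remains is then
    // matched against the SGPR offset.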
    SDValue B;

    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
                                ImmOff);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                    SDValue *SOffset, SDValue *Offset,
                                    bool Imm32Only) const {
  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
    SBase = Expand32BitAddress(SBase);
    return true;
  }

  if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
    SBase = Expand32BitAddress(Addr);
    *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
                    /* Imm32Only */ true);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
                                        SDValue &SOffset) const {
  return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
}

bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
                                           SDValue &SOffset,
                                           SDValue &Offset) const {
  return SelectSMRD(Addr, SBase, &SOffset, &Offset);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ false, /* IsBuffer */ true);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}

bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                                                 SDValue &Offset) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  return N.getValueType() == MVT::i32 &&
         SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset */ nullptr,
                              &Offset, /* Imm32Only */ false,
                              /* IsBuffer */ true);
}

bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
                                            SDValue &Base,
                                            SDValue &Offset) const {
  SDLoc DL(Index);

  if (CurDAG->isBaseWithConstantOffset(Index)) {
    SDValue N0 = Index.getOperand(0);
    SDValue N1 = Index.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

    // (add n0, c0)
    // Don't peel off the offset (c0) if doing so could possibly lead
    // the base (n0) to be negative.
    // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
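    // e.g. for (add n0, 16): unless n0's sign bit is known to be zero, n0
    // itself might be negative, so the whole add is kept as the base instead.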
    if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
        (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  if (isa<ConstantSDNode>(Index))
    return false;

  Base = Index;
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
                                     SDValue Val, uint32_t Offset,
                                     uint32_t Width) {
  if (Val->isDivergent()) {
    unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
    SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);

    return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
  }
  unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
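  // e.g. Offset = 16 and Width = 8 pack to 16 | (8 << 16) = 0x80010.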
  uint32_t PackedVal = Offset | (Width << 16);
  SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);

  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
}

void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
  // "((a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)"
  // "((a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)"
  // Predicate: 0 < b <= c < 32

  const SDValue &Shl = N->getOperand(0);
  ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));

  if (B && C) {
    uint32_t BVal = B->getZExtValue();
    uint32_t CVal = C->getZExtValue();

    if (0 < BVal && BVal <= CVal && CVal < 32) {
      bool Signed = N->getOpcode() == ISD::SRA;
      ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
                              32 - CVal));
      return;
    }
  }
  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "((a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = llvm::popcount(MaskVal);
          ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
                                  WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
  assert(N->getOpcode() == ISD::BRCOND);
  if (!N->hasOneUse())
    return false;

  SDValue Cond = N->getOperand(1);
  if (Cond.getOpcode() == ISD::CopyToReg)
    Cond = Cond.getOperand(2);

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return false;

  MVT VT = Cond.getOperand(0).getSimpleValueType();
  if (VT == MVT::i32)
    return true;

  if (VT == MVT::i64) {
    auto ST = static_cast<const GCNSubtarget *>(Subtarget);

    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
  }

  return false;
}
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
  assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
  // Special case for amdgcn.ballot:
  // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
  // =>
  // Use i1 %Cond value instead of i(WaveSize) %VCMP.
  // This is possible because divergent ISD::SETCC is selected as V_CMP and
  // Cond becomes a i(WaveSize) full mask value.
  // Note that ballot doesn't use the SETEQ condition, but it's easy to support
  // it here for completeness, so in this case Negate is set true on return.
  auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
      isNullConstant(VCMP.getOperand(1))) {

    auto Cond = VCMP.getOperand(0);
    if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
      Cond = Cond.getOperand(0);

    if (isBoolSGPR(Cond)) {
      Negate = VCMP_CC == ISD::SETEQ;
      return Cond;
    }
  }
  return SDValue();
}

void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination
    // of V_CMPs resulting from a ballot, or the ballot had a uniform
    // condition and SCC is used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND.
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when it is unnecessary. But it would be better to add a
    // separate pass after SIFixSGPRCopies to do the unnecessary S_AND
    // removal, so it catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                               : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}

void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
  if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
      !N->isDivergent()) {
    SDValue Src = N->getOperand(0);
    if (Src.getValueType() == MVT::f16) {
      if (isExtractHiElt(Src, Src)) {
        CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
                             {Src});
        return;
      }
    }
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
  unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
  SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
                   N->getOperand(5), N->getOperand(0)};

  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (!Subtarget->hasGWS() ||
      (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
       !Subtarget->hasGWSSemaReleaseAll())) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will take
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16-bits, we could leave it as-is and add 1
    // to the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1)   // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
    if (N->getValueType(0) != MVT::i32)
      break;
    SelectDSAppendConsume(N, IntrID);
    return;
  }
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    SelectDSBvhStackIntrinsic(N);
    return;
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(1);
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    SelectDS_GWS(N, IntrID);
    return;
  default:
    break;
  }

  SelectCode(N);
}

void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
  SDValue Log2WaveSize =
    CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N),
                              MVT::i32);
  CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
                       {N->getOperand(0), Log2WaveSize});
}

void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
  SDValue SrcVal = N->getOperand(1);
  if (SrcVal.getValueType() != MVT::i32) {
    SelectCode(N); // Emit default error
    return;
  }

  SDValue CopyVal;
  Register SP = TLI->getStackPointerRegisterToSaveRestore();
  SDLoc SL(N);

  if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
    CopyVal = SrcVal.getOperand(0);
  } else {
    SDValue Log2WaveSize = CurDAG->getTargetConstant(
        Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);

    if (N->isDivergent()) {
      SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
                                              MVT::i32, SrcVal),
                       0);
    }

    CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                                             {SrcVal, Log2WaveSize}),
                      0);
  }

  SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
}
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                            unsigned &Mods,
                                            bool IsCanonicalizing,
                                            bool AllowAbs) const {
  Mods = SISrcMods::NONE;
  Src = In;

  if (Src.getOpcode() == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    Src = Src.getOperand(0);
  } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
    if (LHS && LHS->isZero()) {
      Mods |= SISrcMods::NEG;
      Src = Src.getOperand(1);
    }
  }

  if (AllowAbs && Src.getOpcode() == ISD::FABS) {
    Mods |= SISrcMods::ABS;
    Src = Src.getOperand(0);
  }

  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                        SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/true)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
    SDValue In, SDValue &Src, SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
                         /*AllowAbs=*/true)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods,
                         /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false)) {
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
  if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
    return false;

  Src = In;
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
                                               SDValue &SrcMods,
                                               bool OpSel) const {
  unsigned Mods;
  if (SelectVOP3ModsImpl(In, Src, Mods,
                         /*IsCanonicalizing=*/true,
                         /*AllowAbs=*/false)) {
    if (OpSel)
      Mods |= SISrcMods::OP_SEL_0;
    SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
                                           SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
}

bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
}

bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, SDValue &Clamp,
                                         SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3Mods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
                                          SDValue &SrcMods, SDValue &Clamp,
                                          SDValue &Omod) const {
  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return SelectVOP3BMods(In, Src, SrcMods);
}

bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
                                         SDValue &Clamp, SDValue &Omod) const {
  Src = In;

  SDLoc DL(In);
  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);

  return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods, bool IsDOT) const {
  unsigned Mods = SISrcMods::NONE;
  Src = In;

  // TODO: Handle G_FSUB 0 as fneg
  if (Src.getOpcode() == ISD::FNEG) {
    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
    Src = Src.getOperand(0);
  }

  if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
      (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
    unsigned VecMods = Mods;

    SDValue Lo = stripBitcast(Src.getOperand(0));
    SDValue Hi = stripBitcast(Src.getOperand(1));

    if (Lo.getOpcode() == ISD::FNEG) {
      Lo = stripBitcast(Lo.getOperand(0));
      Mods ^= SISrcMods::NEG;
    }

    if (Hi.getOpcode() == ISD::FNEG) {
      Hi = stripBitcast(Hi.getOperand(0));
      Mods ^= SISrcMods::NEG_HI;
    }

    if (isExtractHiElt(Lo, Lo))
      Mods |= SISrcMods::OP_SEL_0;

    if (isExtractHiElt(Hi, Hi))
      Mods |= SISrcMods::OP_SEL_1;

    unsigned VecSize = Src.getValueSizeInBits();
    Lo = stripExtractLoElt(Lo);
    Hi = stripExtractLoElt(Hi);

    if (Lo.getValueSizeInBits() > VecSize) {
      Lo = CurDAG->getTargetExtractSubreg(
          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
          MVT::getIntegerVT(VecSize), Lo);
    }

    if (Hi.getValueSizeInBits() > VecSize) {
      Hi = CurDAG->getTargetExtractSubreg(
          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
          MVT::getIntegerVT(VecSize), Hi);
    }

    assert(Lo.getValueSizeInBits() <= VecSize &&
           Hi.getValueSizeInBits() <= VecSize);

    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
      // Really a scalar input. Just select from the low half of the register
      // to avoid packing.

      if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
        Src = Lo;
      } else {
        assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);

        SDLoc SL(In);
        SDValue Undef = SDValue(
            CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
                                   Lo.getValueType()), 0);
        auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
                                    : AMDGPU::SReg_64RegClassID;
        const SDValue Ops[] = {
          CurDAG->getTargetConstant(RC, SL, MVT::i32),
          Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
          Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };

        Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                             Src.getValueType(), Ops), 0);
      }
      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
      return true;
    }

    if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
                         .bitcastToAPInt().getZExtValue();
      if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
        Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
        SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
        return true;
      }
    }

    Mods = VecMods;
  }

  // Packed instructions do not have abs modifiers.
  Mods |= SISrcMods::OP_SEL_1;

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  return SelectVOP3PMods(In, Src, SrcMods, true);
}

bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
  const ConstantSDNode *C = cast<ConstantSDNode>(In);
  // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
  // 1 promotes packed values to signed, 0 treats them as unsigned.
  assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");

  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned SrcSign = C->getZExtValue();
  if (SrcSign == 1)
    Mods ^= SISrcMods::NEG;

  Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
                                                  SDValue &Src) const {
  const ConstantSDNode *C = cast<ConstantSDNode>(In);
  assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");

  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned SrcVal = C->getZExtValue();
  if (SrcVal == 1)
    Mods |= SISrcMods::OP_SEL_0;

  Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
                                         llvm::SelectionDAG *CurDAG,
                                         const SDLoc &DL) {
  unsigned DstRegClass;
  EVT DstTy;
  switch (Elts.size()) {
  case 8:
    DstRegClass = AMDGPU::VReg_256RegClassID;
    DstTy = MVT::v8i32;
    break;
  case 4:
    DstRegClass = AMDGPU::VReg_128RegClassID;
    DstTy = MVT::v4i32;
    break;
  case 2:
    DstRegClass = AMDGPU::VReg_64RegClassID;
    DstTy = MVT::v2i32;
    break;
  default:
    llvm_unreachable("unhandled Reg sequence size");
  }

  SmallVector<SDValue, 17> Ops;
  Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
  for (unsigned i = 0; i < Elts.size(); ++i) {
    Ops.push_back(Elts[i]);
    Ops.push_back(CurDAG->getTargetConstant(
        SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
  }
  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
}

static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
                                         llvm::SelectionDAG *CurDAG,
                                         const SDLoc &DL) {
  SmallVector<SDValue, 8> PackedElts;
  assert("unhandled Reg sequence size" &&
         (Elts.size() == 8 || Elts.size() == 16));

  // Pack 16-bit elements in pairs into 32-bit register. If both elements are
  // unpacked from 32-bit source use it, otherwise pack them using v_perm.
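  // (With the byte selector 0x05040100 below, V_PERM_B32 places the low
  // 16 bits of Elts[i] in the low half of the result and the low 16 bits of
  // Elts[i + 1] in the high half, so the pair lands in one 32-bit register.)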
  for (unsigned i = 0; i < Elts.size(); i += 2) {
    SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
    SDValue HiSrc;
    if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
      PackedElts.push_back(HiSrc);
    } else {
      SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
      MachineSDNode *Packed =
          CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
                                 {Elts[i + 1], Elts[i], PackLoLo});
      PackedElts.push_back(SDValue(Packed, 0));
    }
  }

  return buildRegSequence32(PackedElts, CurDAG, DL);
}

static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
                                       llvm::SelectionDAG *CurDAG,
                                       const SDLoc &DL, unsigned ElementSize) {
  if (ElementSize == 16)
    return buildRegSequence16(Elts, CurDAG, DL);
  if (ElementSize == 32)
    return buildRegSequence32(Elts, CurDAG, DL);
  llvm_unreachable("Unhandled element size");
}

static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
                                 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
                                 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
                                 unsigned ElementSize) {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have abs modifier
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(El->getOperand(0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
  }
}
// Check all f16 elements for modifiers while looking through b32 and v2b16
// build vectors; stop if an element does not satisfy ModifierCheck.
static void
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
                              std::function<bool(SDValue)> ModifierCheck) {
  for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
    if (auto *F16Pair =
            dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
      for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
        SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
        if (!ModifierCheck(ElF16))
          break;
      }
    }
  }
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Element.getOperand(0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on first element decide which mod we match, neg or abs
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(ElV2f16.getOperand(0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
      // Based on first element decide which mod we match, neg or abs
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(ElF16.getOperand(0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
                           16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on first element decide which mod we match, neg or abs
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2f16->getOperand(0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    assert(BV->getNumOperands() > 0);
    // Based on first element decide which mod we match, neg or abs
    SDValue ElF32 = stripBitcast(BV->getOperand(0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(BV->getOperand(i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(ElF32.getOperand(0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16
                                   ? APFloatBase::IEEEhalf()
                                   : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(FloatVal)) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(RawValue.value())) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
                                            SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

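  // A shift of the 32-bit source by a byte multiple selects that byte as the
  // index key, e.g. (srl x, 16) yields Key = 2.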
  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() % 8 == 0) {
      Key = ShiftAmt->getZExtValue() / 8;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() == 16) {
      Key = 1;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and the source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                  SDValue &SrcMods) const {
  unsigned Mods = 0;
  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
    return false;
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}
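
// Worked example (illustrative, not part of the original source): for a
// mad_mix-style operand such as
//
//   f32 = fp_extend (f16 = extract_vector_elt (v2f16 %v), 1)
//
// SelectVOP3PMadMixModsImpl strips the fp_extend, recognizes the high-half
// extract via isExtractHiElt, and returns Src = %v with
// OP_SEL_1 | OP_SEL_0 set: op_sel_hi requests the f16-to-f32 conversion and
// op_sel picks the high 16 bits of the 32-bit source register, exactly as
// the comment inside the function describes.
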
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" at this point, we have not succeeded
      // in commuting the current user. This means at least one use strictly
      // requires a VGPR, so we will not attempt to commute other user
      // instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  auto Ld = cast<LoadSDNode>(N);

  const MachineMemOperand *MMO = Ld->getMemOperand();
  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
    return false;

  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more.
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}

char AMDGPUDAGToDAGISelLegacy::ID = 0;
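
// Note (illustrative, not part of the original source): PostprocessISelDAG
// above iterates to a fixed point on purpose. A successful PostISelFolding
// replacement can presumably expose further folds in the users of the
// replaced node, so a single sweep over the node list is not guaranteed to
// be sufficient; RemoveDeadNodes() clears out whatever each sweep orphaned
// before the next iteration re-scans the DAG.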