Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
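
// Illustration (editor's sketch): getEquivalentMemType picks a same-sized
// integer form for awkward vector types, e.g.
//   getEquivalentMemType(Ctx, MVT::v4i8);  // 32 bits -> i32
//   getEquivalentMemType(Ctx, MVT::v4i16); // 64 bits -> v2i32
// numBitsUnsigned and numBitsSigned feed the 24-bit arithmetic combines later
// in this file, which can select a multiply as MUL_U24/MUL_I24 when both
// operands are known to fit in 24 bits.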

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
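
  // Note (editor's illustration): in the load-extend tables below, a Legal
  // entry such as (SEXTLOAD, i32, i8) corresponds to a native sign-extending
  // byte load (e.g. buffer_load_sbyte), while Expand splits the operation into
  // a plain load followed by a separate extend.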

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
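
  // Editor's note: FREM is custom lowered (see LowerFREM) roughly as
  //   frem(x, y) -> fma(-trunc(x / y), y, x)   // i.e. x - y * trunc(x / y)
  // since there is no hardware floating-point remainder instruction.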

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);
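
  // Editor's note: the odd-sized vector types below have no native
  // operations, so marking everything Expand scalarizes them; e.g. an add of
  // v3i32 becomes three 32-bit adds during legalization.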

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For
  // now, we don't have a way of knowing during instruction selection if a
  // condition will be uniform and we always use vector compares. Assume we are
  // using vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
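
// Illustration (editor's sketch): for the opcodes above, an fneg of the
// result can be pushed into the operands, where it becomes a free 'neg'
// source modifier on the VALU instruction, e.g.
//   fneg (fadd x, y) -> fadd (fneg x), (fneg y)
// (valid only when signed zeros may be ignored; see mayIgnoreSignedZero).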

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// Returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type for ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and folding a modifier would force each of them into a VOP3
  // encoding, there will be a code size increase. Try to avoid increasing code
  // size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32 bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
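
// Editor's note: FP immediates are reported legal because the hardware can
// encode common values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, ...) as inline
// constants in the instruction word, so there is little to gain from
// constant-pool loads or from shrinking f64 constants to f32.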

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {
  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}
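
// Editor's note: getNegatedExpression below also pushes an fneg into
// AMDGPUISD::RCP, e.g. fneg (rcp x) -> rcp (fneg x), where the inner fneg
// then becomes a free source modifier.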

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {
  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra 'mov 0' used to extend to 64 bits is
  // free. As used, this will enable reducing 64-bit operations to 32-bit ones,
  // which is always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}
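
// Illustration (editor's sketch): a free zext i32 -> i64 is just
//   v_mov_b32 v1, 0   ; high half
// with the low half left in place, since 64-bit values live in pairs of
// 32-bit registers.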

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}
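
// Editor's note: the i32 right-shift check above protects bitfield-extract
// selection; e.g. (srl (shl (or x, y), 16), 24) can select roughly as a
// single v_bfe_u32/s_bfe_u32 of the or result, which commuting the shl
// through the or would break.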

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
The "PartOffset" is completely useless1175// to us as computed in Ins.1176//1177// We also need to figure out what type legalization is trying to do to get1178// the correct memory offsets.11791180SmallVector<EVT, 16> ValueVTs;1181SmallVector<uint64_t, 16> Offsets;1182ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);11831184for (unsigned Value = 0, NumValues = ValueVTs.size();1185Value != NumValues; ++Value) {1186uint64_t BasePartOffset = Offsets[Value];11871188EVT ArgVT = ValueVTs[Value];1189EVT MemVT = ArgVT;1190MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);1191unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);11921193if (NumRegs == 1) {1194// This argument is not split, so the IR type is the memory type.1195if (ArgVT.isExtended()) {1196// We have an extended type, like i24, so we should just use the1197// register type.1198MemVT = RegisterVT;1199} else {1200MemVT = ArgVT;1201}1202} else if (ArgVT.isVector() && RegisterVT.isVector() &&1203ArgVT.getScalarType() == RegisterVT.getScalarType()) {1204assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());1205// We have a vector value which has been split into a vector with1206// the same scalar type, but fewer elements. This should handle1207// all the floating-point vector types.1208MemVT = RegisterVT;1209} else if (ArgVT.isVector() &&1210ArgVT.getVectorNumElements() == NumRegs) {1211// This arg has been split so that each element is stored in a separate1212// register.1213MemVT = ArgVT.getScalarType();1214} else if (ArgVT.isExtended()) {1215// We have an extended type, like i65.1216MemVT = RegisterVT;1217} else {1218unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;1219assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);1220if (RegisterVT.isInteger()) {1221MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);1222} else if (RegisterVT.isVector()) {1223assert(!RegisterVT.getScalarType().isFloatingPoint());1224unsigned NumElements = RegisterVT.getVectorNumElements();1225assert(MemoryBits % NumElements == 0);1226// This vector type has been split into another vector type with1227// a different elements size.1228EVT ScalarVT = EVT::getIntegerVT(State.getContext(),1229MemoryBits / NumElements);1230MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);1231} else {1232llvm_unreachable("cannot deduce memory type.");1233}1234}12351236// Convert one element vectors to scalar.1237if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)1238MemVT = MemVT.getScalarType();12391240// Round up vec3/vec5 argument.1241if (MemVT.isVector() && !MemVT.isPow2VectorType()) {1242assert(MemVT.getVectorNumElements() == 3 ||1243MemVT.getVectorNumElements() == 5 ||1244(MemVT.getVectorNumElements() >= 9 &&1245MemVT.getVectorNumElements() <= 12));1246MemVT = MemVT.getPow2VectorType(State.getContext());1247} else if (!MemVT.isSimple() && !MemVT.isVector()) {1248MemVT = MemVT.getRoundIntegerType(State.getContext());1249}12501251unsigned PartOffset = 0;1252for (unsigned i = 0; i != NumRegs; ++i) {1253State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,1254BasePartOffset + PartOffset,1255MemVT.getSimpleVT(),1256CCValAssign::Full));1257PartOffset += MemVT.getStoreSize();1258}1259}1260}1261}12621263SDValue AMDGPUTargetLowering::LowerReturn(1264SDValue Chain, CallingConv::ID CallConv,1265bool isVarArg,1266const SmallVectorImpl<ISD::OutputArg> &Outs,1267const SmallVectorImpl<SDValue> &OutVals,1268const SDLoc &DL, SelectionDAG &DAG) const {1269// FIXME: Fails for r600 

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered object.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
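
// Editor's note: lowerUnhandledCall below is the shared failure path for call
// lowering. It emits a DiagnosticInfoUnsupported naming the callee, fills the
// expected results with UNDEF so selection can proceed, and returns the entry
// chain.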

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getUNDEF(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}
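
// Editor's note: for LDS (local address space) globals, LowerGlobalAddress
// below folds the address away entirely: the variable is assigned a static
// offset in the kernel's LDS block via allocateLDSGlobal, and the "address"
// becomes a plain 32-bit constant.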

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {
  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}
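
// Editor's illustration: LowerCONCAT_VECTORS below avoids per-element copies
// for small-element vectors by reassembling whole 32-bit registers, roughly
//   concat (v2f16 a), (v2f16 b)
//     -> bitcast (build_vector (bitcast a to i32), (bitcast b to i32))
// yielding a v4f16 from a v2i32 build_vector.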

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}
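// Illustrative note (added commentary, not from the original source): for
// 16-bit element vectors the lowering above operates on whole 32-bit
// registers. For example, extracting v2f16 at element 2 of a v8f16 source
// becomes extracting lane 1 of the v4i32 bitcast, then bitcasting that lane
// back to v2f16, avoiding any sub-register shuffles.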

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two
// vector. The second part is whatever is left over, and is a scalar if it
// would otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}
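// Worked example (added for illustration, not in the original source):
// getSplitDestVTs rounds the low half up to a power of two, so
//   v7i32 -> (v4i32, v3i32)
//   v3i32 -> (v2i32, i32)    (a leftover 1-vector becomes a scalar)
//   v8i32 -> (v4i32, v4i32)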

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();

  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}
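// Note on the correction step above (added commentary, not original): the
// truncated estimate fq = trunc(fa * rcp(fb)) can be one unit below the true
// quotient because rcp is only approximately correct. The residual
// fr = |fa - fq * fb| is compared against |fb|; if a whole divisor still
// fits, one more unit (jq, which also carries the quotient's sign in the
// signed case) is added to iq.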

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS_Lo, LHS_Hi;
  SDValue LHS = Op.getOperand(0);
  std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);

  SDValue RHS_Lo, RHS_Hi;
  SDValue RHS = Op.getOperand(1);
  std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // The algorithm here is based on ideas from "Software Integer Division",
    // Tom Rodeheffer, August 2008.

    MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    // Compute denominator reciprocal.
    unsigned FMAD =
        !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
            ? (unsigned)ISD::FMAD
            : (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
        DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
        Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
        DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
        DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
        DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
        Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First round of UNR (Unsigned integer Newton-Raphson).
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo, Mulhi1_Hi;
    std::tie(Mulhi1_Lo, Mulhi1_Hi) =
        DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
    SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second round of UNR.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo, Mulhi2_Hi;
    std::tie(Mulhi2_Lo, Mulhi2_Hi) =
        DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
    SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo, Mul3_Hi;
    std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
    SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi,
                                   ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero,
                                   ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}
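// The loop above is classic restoring long division, one quotient bit per
// iteration. A scalar sketch of the same idea (illustrative only, not part
// of the original file):
//   uint64_t rem = hi_rem; // speculative remainder of the high word
//   uint32_t div = 0;
//   for (int bit = 31; bit >= 0; --bit) {
//     rem = (rem << 1) | ((lo >> bit) & 1);
//     if (rem >= rhs) { rem -= rhs; div |= 1u << bit; }
//   }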

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);

  // One round of UNR.
  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));

  // Quotient/remainder estimate.
  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
  SDValue R =
      DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));

  // First quotient/remainder refinement.
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  return DAG.getMergeValues({Q, R}, DL);
}
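// Why two refinement rounds are enough (added note, not from the original):
// after the single Newton-Raphson step the fixed-point reciprocal Z slightly
// underestimates 2^32/y, so the initial quotient Q = mulhu(x, Z) can be low,
// but only by a small bounded amount (at most 2 here). Each SETUGE check
// adds one back and subtracts y from the remainder, so the second round
// always lands on the exact quotient/remainder pair.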

SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue NegOne = DAG.getConstant(-1, DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
      return Res;
  }

  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    // HiLo split
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL,
                                 DAG.getVTList(HalfVT, HalfVT), LHS_Lo, RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    };
    return DAG.getMergeValues(Res, DL);
  }

  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}

// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  auto Flags = Op->getFlags();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
  // TODO: For f32 use FMAD instead if !hasFastFMA32?
  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
}
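// Worked instance of the frem expansion above (added for illustration):
//   frem(5.5, 2.0) = fma(-trunc(5.5 / 2.0), 2.0, 5.5)
//                  = fma(-2.0, 2.0, 5.5) = 1.5
// which keeps the sign of the dividend, as frem requires.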

SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
                                  SelectionDAG &DAG) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                Hi,
                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
                                DAG.getConstant(ExpBits, SL, MVT::i32));
  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
                            DAG.getConstant(1023, SL, MVT::i32));

  return Exp;
}

SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);

  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}

SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);

  // TODO: Should this propagate fast-math-flags?

  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}
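// Note on the 0x1.0p+52 constant above (added commentary, not original):
// adding copysign(2^52, src) pushes all fraction bits below 1.0 out of the
// f64 significand, so the hardware's round-to-nearest-even performs the
// integer rounding; subtracting the same constant leaves roundeven(src).
// Inputs with |src| > 0x1.fffffffffffffp+51 are already integral and are
// returned unchanged by the final select.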

SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FNEARBYINT and FRINT are the same, except in their handling of FP
  // exceptions. Those aren't really meaningful for us, and OpenCL only has
  // rint, so just treat them as equivalent.
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
                     Op.getOperand(0));
}

SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
  auto VT = Op.getValueType();
  auto Arg = Op.getOperand(0u);
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
}

// XXX - May require not supporting f32 denormals?

// Don't handle v2f16. The extra instructions to scalarize and repack around
// the compare and vselect end up producing worse code than scalarizing the
// whole operation.
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);

  // TODO: Should this propagate fast-math-flags?

  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);

  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  const SDValue One = DAG.getConstantFP(1.0, SL, VT);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
  SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);

  SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
  return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
}

SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(SDValue Src) {
  switch (Src.getOpcode()) {
  case ISD::FP_EXTEND:
    return Src.getOperand(0).getValueType() == MVT::f16;
  case ISD::FP16_TO_FP:
  case ISD::FFREXP:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Src.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_frexp_mant:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }

  llvm_unreachable("covered opcode switch");
}

bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  if (Flags.hasApproximateFuncs())
    return true;
  auto &Options = DAG.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
                                                  SDValue Src,
                                                  SDNodeFlags Flags) {
  return !valueIsKnownNeverF32Denorm(Src) &&
         DAG.getMachineFunction()
                 .getDenormalMode(APFloat::IEEEsingle())
                 .Input != DenormalMode::PreserveSign;
}

SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
                                                    SDValue Src,
                                                    SDNodeFlags Flags) const {
  SDLoc SL(Src);
  EVT VT = Src.getValueType();
  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path.
  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  return IsLtSmallestNormal;
}

SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
                                          SDNodeFlags Flags) const {
  SDLoc SL(Src);
  EVT VT = Src.getValueType();
  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
  SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
  SDValue IsFinite = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
      Inf, ISD::SETOLT);
  return IsFinite;
}

/// If denormal handling is required return the scaled input to FLOG2, and the
/// check for denormal range. Otherwise, return null values.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
                                        SDValue Src, SDNodeFlags Flags) const {
  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return {};

  MVT VT = MVT::f32;
  const fltSemantics &Semantics = APFloat::IEEEsingle();
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ScaleFactor =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);

  SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
  return {ScaledInput, IsLtSmallestNormal};
}

SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  auto [ScaledInput, IsLtSmallestNormal] =
      getScaledLogInput(DAG, SL, Src, Flags);
  if (!ScaledInput)
    return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);

  SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);

  SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue ResultOffset =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
  return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
}
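// Concrete case of the scaling above (illustrative, not from the original):
// for the denormal input x = 0x1.0p-140, ScaledInput = 0x1.0p-108 is normal,
// v_log_f32 returns -108.0, and subtracting the 32.0 offset restores the
// exact result log2(x) = -140.0.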

static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
                      SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
  return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
}

SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);

  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  const auto &Options = getTargetMachine().Options;
  if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
      Options.ApproxFuncFPMath || Options.UnsafeFPMath) {

    if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
      // Log and multiply in f32 is good enough for f16.
      X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
    if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
      return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
                         DAG.getTargetConstant(0, DL, MVT::i32), Flags);
    }

    return Lowered;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);

    R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
    SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
    SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
    R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);

    SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
    SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
    SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
    SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
    SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);

    SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
    SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
    SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
    R = getMad(DAG, DL, VT, YH, CH, Mad1);
  }

  const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
                            (Flags.hasNoInfs() || Options.NoInfsFPMath);

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    SDValue IsFinite = getIsFinite(DAG, Y, Flags);
    R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
  }

  if (IsScaled) {
    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
    R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
  }

  return R;
}

SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}

// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for
// a promoted f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
      SDValue ScaledResultOffset =
          DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);

      SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);

      SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
                                         ScaledResultOffset, Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);

      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
      return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);

  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
                     Flags);
}

SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue NeedsScaling =
      DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);

  SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);

  SDValue AddOffset =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);

  SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);

  SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ResultScale =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);

  return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
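// Sanity check of the exp2 scaling above (added note, not original): for
// x = -140.0, NeedsScaling is true, so v_exp_f32 evaluates 2^(-140 + 64) =
// 2^-76 well inside the normal range, and the final multiply by 0x1.0p-64f
// produces the denormal result 2^-140 exactly.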

SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
                                              SelectionDAG &DAG,
                                              SDNodeFlags Flags) const {
  EVT VT = X.getValueType();
  const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);

  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
    // exp2(M_LOG2E_F * f);
    SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
    return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
                                      : (unsigned)ISD::FEXP2,
                       SL, VT, Mul, Flags);
  }

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);

  SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);

  SDValue AdjustedX =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);

  SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);

  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);

  SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);

  return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
                     Flags);
}
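// Note on the magic constants above (added commentary): shifting x up by
// +64 before the multiply by log2(e) scales the final result by e^64, so
// the compensation factor is e^-64 ~= 0x1.969d48p-93f, and the threshold
// -0x1.5d58a0p+6f ~= ln(2^-126) marks where e^x would otherwise drop below
// the smallest normal f32.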

/// Emit an approx-funcs-appropriate lowering for exp10. inf/nan should still
/// be handled correctly.
SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
                                                SelectionDAG &DAG,
                                                SDNodeFlags Flags) const {
  const EVT VT = X.getValueType();
  const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;

  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
    SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);

    SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
    SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
    SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
    return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
  }

  // bool s = x < -0x1.2f7030p+5f;
  // x += s ? 0x1.0p+5f : 0.0f;
  // exp10 = exp2(x * 0x1.a92000p+1f) *
  //         exp2(x * 0x1.4f0978p-11f) *
  //         (s ? 0x1.9f623ep-107f : 1.0f);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);

  SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
  SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
  SDValue AdjustedX =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);

  SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
  SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);

  SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
  SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
  SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
  SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);

  SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);

  SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);

  return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult,
                     MulExps, Flags);
}

SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  if (VT.getScalarType() == MVT::f16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
      return lowerFEXPUnsafe(X, SL, DAG, Flags);

    if (VT.isVector())
      return SDValue();

    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))

    // Nothing in half is a denormal when promoted to f32.
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
    SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) {
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
    SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);

    PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
    SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
    PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
  } else {
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
    SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);

    SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
    SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
    SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
    SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
    SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);

    PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);

    SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
    PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
  }

  SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);

  SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
  SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);

  SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);

  SDValue UnderflowCheckConst =
      DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue Underflow =
      DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);

  R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
  const auto &Options = getTargetMachine().Options;

  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
    SDValue OverflowCheckConst =
        DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
    SDValue Overflow =
        DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
    R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
  }

  return R;
}

static bool isCtlzOpc(unsigned Opc) {
  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
}

static bool isCttzOpc(unsigned Opc) {
  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
}

SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                               SelectionDAG &DAG) const {
  auto SL = SDLoc(Op);
  auto Opc = Op.getOpcode();
  auto Arg = Op.getOperand(0u);
  auto ResultVT = Op.getValueType();

  if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
    return {};

  assert(isCtlzOpc(Opc));
  assert(ResultVT == Arg.getValueType());

  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
  SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
  SDValue NewOp;

  if (Opc == ISD::CTLZ_ZERO_UNDEF) {
    NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
    NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
  } else {
    NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
    NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
  }

  return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}

SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)

    // The 64-bit scalar version produces a 32-bit result:
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
    if (!ZeroUndef) {
      const SDValue ConstVal = DAG.getConstant(
          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
    }
    return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
  }

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
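  // Worked example (illustrative): for src = 0x0000'0001'0000'0000, ctlz
  // should be 31. ffbh(hi = 1) = 31, while ffbh(lo = 0) = -1 and
  // uaddsat(-1, 32) saturates to UINT32_MAX, so the umin chain gives
  // umin(31, UINT32_MAX, 64) = 31.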

  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
  else
    OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
    NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method of converting a 64-bit integer to float roughly
  // consists of 2 steps: normalization and rounding. In fact, after
  // normalization, the conversion from a 64-bit integer to a float is
  // essentially the same as the one from a 32-bit integer. The only
  // difference is that it has more trailing bits to be rounded. To leverage
  // the native 32-bit conversion, a 64-bit integer can be preprocessed and
  // fit into a 32-bit integer, then converted into the correct float number.
  // The basic steps for the unsigned conversion are illustrated in the
  // following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // Convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count
  // its sign bits instead. If 'ffbh_i32' is not available, its absolute
  // value is converted instead, followed by negation based on its sign bit.
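  //
  // Worked example (illustrative, added commentary): u = 2^40 + 1 gives
  // hi = 0x100, lo = 1, so shamt = clz(hi) = 23. After u <<= 23 we have
  // hi = 0x80000000 and lo = 0x800000; lo != 0 sets the sticky bit, so
  // hi |= 1. uitofp(hi) performs the single rounding step, and scaling by
  // 2^(32 - 23) = 2^9 yields float(2^40 + 1) = 2^40 as expected.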

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
        DAG.getConstant(31, SL, MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                    OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    // Different from the unsigned conversion, the shift should be one bit
    // less to preserve the sign bit.
    ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
                        DAG.getConstant(1, SL, MVT::i32));
    ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
                         DAG.getConstant(63, SL, MVT::i64));
      SDValue Abs =
          DAG.getNode(ISD::XOR, SL, MVT::i64,
                      DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
      std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
  // Split it again.
  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
                               DAG.getConstant(1, SL, MVT::i32), Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
  // Convert the normalized 32-bit integer into f32.
  unsigned Opc =
      (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);

  // Finally, we need to scale back the converted floating number as the
  // original 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                      ShAmt);
  // On GCN, use LDEXP directly.
  if (Subtarget->isGCN())
    return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the
  // exponent part directly to emulate the multiplication of 2^ShAmt. That
  // 8-bit exponent is enough to avoid overflowing into the sign bit.
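  //
  // Illustrative note (added commentary): adding ShAmt << 23 to the f32 bit
  // pattern below bumps the biased exponent by ShAmt, i.e. multiplies the
  // value by 2^ShAmt, provided the result stays normal and finite.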
  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
                            DAG.getConstant(23, SL, MVT::i32));
  SDValue IVal =
      DAG.getNode(ISD::ADD, SL, MVT::i32,
                  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
  if (Signed) {
    // Set the sign bit.
    Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
                       DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
                       DAG.getConstant(31, SL, MVT::i32));
    IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
  }
  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
                              SL, MVT::f64, Hi);

  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);

  SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
                              DAG.getConstant(32, SL, MVT::i32));
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}

SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Factor out code common with LowerSINT_TO_FP.
  EVT DestVT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;
    SDLoc DL(Op);

    // Promote src to i32
    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
    return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
  }

  if (DestVT == MVT::bf16) {
    SDLoc SL(Op);
    SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, false);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, false);
}

SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT DestVT = Op.getValueType();

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;

    SDLoc DL(Op);
    // Promote src to i32
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
    return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
  }

  if (DestVT == MVT::bf16) {
    SDLoc SL(Op);
    SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  // TODO: Factor out code common with LowerUINT_TO_FP.

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, true);
}

SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of
  // 32-bit integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
                       DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
                       DAG.getConstant(31, SL, MVT::i32));
    Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
  }

  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
        SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
        SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);

  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
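  // Illustrative note (added commentary): the fma computes
  // lof = hif * (-2^32) + tf, i.e. the exact remainder tf - hif * 2^32,
  // which is non-negative because hif was rounded down with floor.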
  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                         : ISD::FP_TO_UINT,
                           SL, MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                               DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                       DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
    // r := xor(r, sign) - sign;
    Result =
        DAG.getNode(ISD::SUB, SL, MVT::i64,
                    DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
  }

  return Result;
}

SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue N0 = Op.getOperand(0);

  // Convert to target node to get known bits
  if (N0.getValueType() == MVT::f32)
    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);

  if (getTargetMachine().Options.UnsafeFPMath) {
    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    return SDValue();
  }

  assert(N0.getSimpleValueType() == MVT::f64);

  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
                           DAG.getConstant(32, DL, MVT::i64));
  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(20, DL, MVT::i64));
  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
                  DAG.getConstant(ExpMask, DL, MVT::i32));
  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));

  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(8, DL, MVT::i32));
  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
                  DAG.getConstant(0xffe, DL, MVT::i32));

  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
                                  DAG.getConstant(0x1ff, DL, MVT::i32));
  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);

  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
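  // Illustrative note (added commentary): 0x7c00 is the all-ones f16
  // exponent field (Inf); OR-ing in the mantissa bit 0x0200 when M != 0
  // keeps an f64 NaN a NaN instead of collapsing it to Inf.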
  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
                          DAG.getSelectCC(DL, M, Zero,
                                          DAG.getConstant(0x0200, DL, MVT::i32),
                                          Zero, ISD::SETNE),
                          DAG.getConstant(0x7c00, DL, MVT::i32));

  // N = M | (E << 12);
  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
                          DAG.getNode(ISD::SHL, DL, MVT::i32, E,
                                      DAG.getConstant(12, DL, MVT::i32)));

  // B = clamp(1-E, 0, 13);
  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, One, E);
  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
                  DAG.getConstant(13, DL, MVT::i32));

  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
                                   DAG.getConstant(0x1000, DL, MVT::i32));

  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);

  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
                              DAG.getConstant(0x7, DL, MVT::i32));
  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
                  DAG.getConstant(2, DL, MVT::i32));
  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
                               One, Zero, ISD::SETEQ);
  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
                               One, Zero, ISD::SETGT);
  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);

  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
                      I, V, ISD::SETEQ);

  // Extract the sign bit.
  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                             DAG.getConstant(16, DL, MVT::i32));
  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
                     DAG.getConstant(0x8000, DL, MVT::i32));

  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}

SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();

  // Will be selected natively
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
    return Op;

  if (SrcVT == MVT::bf16) {
    SDLoc DL(Op);
    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
  }

  // Promote i16 to i32
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
  }

  if (DestVT != MVT::i64)
    return Op;

  if (SrcVT == MVT::f16 ||
      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    unsigned Ext =
        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
  }

  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
    return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);

  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  assert(VT.isVector());

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getBuildVector(VT, DL, Args);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

static bool isU24(SDValue Op, SelectionDAG &DAG) {
  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
}

static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
}

static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    unsigned IID = Node24->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications
  // that involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
                       DemandedLHS ? DemandedLHS : LHS,
                       DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}

template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
                               uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    return DAG.getConstant(Result, DL, MVT::i32);
  }

  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
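
// Worked example (illustrative, added commentary): extracting 8 bits at
// offset 8 from 0x12345678 gives 0x56. Width + Offset = 16 < 32, so Shl is
// 0x12345678 << 16 = 0x56780000, and the shift right by 24 (logical for the
// unsigned variant, arithmetic for the signed one) yields 0x56.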

static bool hasVolatileUser(SDNode *Val) {
  for (SDNode *U : Val->uses()) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
      if (M->isVolatile())
        return true;
    }
  }

  return false;
}

bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
  // i32 vectors are the canonical memory type.
  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    return false;

  if (!VT.isByteSized())
    return false;

  unsigned Size = VT.getStoreSize();

  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    return false;

  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    return false;

  return true;
}

// Replace load of an illegal type with a load of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(N);
  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack
    // and unpack the bytes again are not eliminated in the case of an
    // unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);

      return DAG.getMergeValues(Ops, SDLoc(N));
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(NewVT, SL, LN->getChain(),
                  LN->getBasePtr(), LN->getMemOperand());

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
  DCI.CombineTo(N, BC, NewLoad.getValue(1));
  return SDValue(N, 0);
}

// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  if (!SN->isSimple() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack
    // and unpack the bytes again are not eliminated in the case of an
    // unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(SDValue(SN, 0), DAG);

      return expandUnalignedStore(SN, DAG);
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}
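
// Illustrative note (added commentary): e.g. a well-aligned v4i8 load or
// store passes shouldCombineMemoryType and is rewritten above as an i32
// access plus a bitcast, the canonical 32-bit memory type chosen by
// getEquivalentMemType, rather than being scalarized by type legalization.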

// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
  //   (vt2 (truncate (assertzext vt0:x, vt1)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    SDValue N1 = N->getOperand(1);
    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.bitsGE(ExtVT)) {
      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    }
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(1);
    SDValue PeekSign = peekFPSignOps(Src);
    if (PeekSign == Src)
      return SDValue();
    return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}

/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
    DAGCombinerInfo &DCI, const SDLoc &SL,
    unsigned Opc, SDValue LHS,
    uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  unsigned RHSVal = RHS->getZExtValue();
  if (!RHSVal)
    return LHS;

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  switch (LHS->getOpcode()) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    SDValue X = LHS->getOperand(0);

    if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
        isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
      // Prefer build_vector as the canonical form if packed types are legal.
      // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
      SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
          { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
      return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    }

    // shl (ext x) => zext (shl x), if shift does not overflow int
    if (VT != MVT::i64)
      break;
    KnownBits Known = DAG.computeKnownBits(X);
    unsigned LZ = Known.countMinLeadingZeros();
    if (LZ < RHSVal)
      break;
    EVT XVT = X.getValueType();
    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
    return DAG.getZExtOrTrunc(Shl, SL, VT);
  }
  }

  if (VT != MVT::i64)
    return SDValue();

  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
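  //
  // Worked example (illustrative, added commentary): (shl i64:x, 40) becomes
  // build_pair 0, (shl (i32 (trunc x)), 8); the low 32 result bits are known
  // zero, and only a single 32-bit shift of the truncated value remains.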
  if (RHSVal < 32)
    return SDValue();

  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal = RHS->getZExtValue();

  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
  if (RHSVal == 32) {
    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
                                   DAG.getConstant(31, SL, MVT::i32));

    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
  }

  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
  if (RHSVal == 63) {
    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
                                   DAG.getConstant(31, SL, MVT::i32));
    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  unsigned ShiftAmt = RHS->getZExtValue();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
  // This improves the ability to match BFE patterns in isel.
  if (LHS.getOpcode() == ISD::AND) {
    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
      unsigned MaskIdx, MaskLen;
      if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
          MaskIdx == ShiftAmt) {
        return DAG.getNode(
            ISD::AND, SL, VT,
            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  if (ShiftAmt < 32)
    return SDValue();

  // srl i64:x, C for C >= 32
  // =>
  //   build_pair (srl hi_32(x), C - 32), 0
  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  SDValue Hi = getHiHalf64(LHS, DAG);

  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);

  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});

  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
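
// Worked example (illustrative, added commentary): (srl i64:x, 36) becomes
// build_pair (srl hi_32(x), 4), 0; the upper 32 result bits are zero and
// only one 32-bit shift of the original high half is needed.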

SDValue AMDGPUTargetLowering::performTruncateCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(0);
      EVT EltVT = Elt0.getValueType();
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(ISD::BITCAST, SL,
                             EltVT.changeTypeToInteger(), Elt0);
        }

        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
        SDValue BV = stripBitcast(Src.getOperand(0));
        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
            BV.getValueType().getVectorNumElements() == 2) {
          SDValue SrcElt = BV.getOperand(1);
          EVT SrcEltVT = SrcElt.getValueType();
          if (SrcEltVT.isFloatingPoint()) {
            SrcElt = DAG.getNode(ISD::BITCAST, SL,
                                 SrcEltVT.changeTypeToInteger(), SrcElt);
          }

          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  //   i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(1);
      KnownBits Known = DAG.computeKnownBits(Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
          (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
            EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                             VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
                                    Src.getOperand(0));
        DCI.AddToWorklist(Trunc.getNode());

        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
          DCI.AddToWorklist(Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
                                          Trunc, Amt);
        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
      }
    }
  }

  return SDValue();
}

// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
// multiple uses due to the separate mul + mulh[su].
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
  if (Size <= 32) {
    unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
  }

  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;

  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);

  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
}

/// If \p V is an add of a constant 1, returns the other operand. Otherwise
/// return SDValue().
static SDValue getAddOneOp(const SDNode *V) {
  if (V->getOpcode() != ISD::ADD)
    return SDValue();

  return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
}

SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V.getNode());
    if (!AddOp)
      return SDValue();

    if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
  }

  // There are i16 integer mul/mad.
  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated.
  // Since we can assume the high bits are whatever we want, use the underlying
  // value to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  SDValue Mul;

  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, false);
  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Mul, DL, VT);
}

SDValue
AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated.
  // Since we can assume the high bits are whatever we want, use the underlying
  // value to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);
  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  unsigned LoOpcode = 0;
  unsigned HiOpcode = 0;
  if (Signed) {
    if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_I24;
      HiOpcode = AMDGPUISD::MULHI_I24;
    }
  } else {
    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_U24;
      HiOpcode = AMDGPUISD::MULHI_U24;
    }
  }
  if (!LoOpcode)
    return SDValue();

  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  DCI.CombineTo(N, Lo, Hi);
  return SDValue(N, 0);
}

SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulI24() || VT.isVector())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isI24(N0, DAG) || !isI24(N1, DAG))
    return SDValue();

  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
}

SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isU24(N0, DAG) || !isU24(N1, DAG))
    return SDValue();

  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}

SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
                                          SDValue Op,
                                          const SDLoc &DL,
                                          unsigned Opc) const {
  EVT VT = Op.getValueType();
  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
                              LegalVT != MVT::i16))
    return SDValue();

  if (VT != MVT::i32)
    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);

  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
  if (VT != MVT::i32)
    FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);

  return FFBX;
}

// The native instructions return -1 on 0 input. Optimize out a select that
// produces -1 on 0.
//
// TODO: If zero is not undef, we could also do this if the output is compared
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL,
                                                      SDValue Cond,
                                                      SDValue LHS, SDValue RHS,
                                                      DAGCombinerInfo &DCI) const {
  if (!isNullConstant(Cond.getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue CmpLHS = Cond.getOperand(0);

  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
  if (CCOpcode == ISD::SETEQ &&
      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
      RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
    unsigned Opc =
        isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
  if (CCOpcode == ISD::SETNE &&
      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
      LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
    unsigned Opc =
        isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;

    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  return SDValue();
}

static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
                                         unsigned Op,
                                         const SDLoc &SL,
                                         SDValue Cond,
                                         SDValue N1,
                                         SDValue N2) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N1.getValueType();

  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
                                  N1.getOperand(0), N2.getOperand(0));
  DCI.AddToWorklist(NewSelect.getNode());
  return DAG.getNode(Op, SL, VT, NewSelect);
}

// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(0);
  SDValue LHS = N.getOperand(1);
  SDValue RHS = N.getOperand(2);

  EVT VT = N.getValueType();
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
                                     SDLoc(N), Cond, LHS, RHS);
  }

  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(LHS, RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
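    //
    // Illustrative example (added commentary): select c, (fneg x), 2.0
    // becomes fneg (select c, x, -2.0), letting the fneg later fold into a
    // user as a source modifier.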
    SDValue NewLHS = LHS.getOperand(0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
        return SDValue();

      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

      if (Inv)
        std::swap(NewLHS, NewRHS);

      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
                                      Cond, NewLHS, NewRHS);
      DCI.AddToWorklist(NewSelect.getNode());
      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    }
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  SDValue CC = Cond.getOperand(2);

  SDValue True = N->getOperand(1);
  SDValue False = N->getOperand(2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(True) &&
        !DAG.isConstantValueOfAnyType(False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}

static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) ||
         APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}
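
// Illustrative note (added commentary): these bit patterns are 1/(2*pi)
// (~0.15915494) in f16, f32 and f64. Subtargets with hasInv2PiInlineImm()
// can encode the positive value as an inline immediate, which is why its
// negation is treated as the expensive direction below.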

// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
TargetLowering::NegatibleCost
AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
  if (C->isZero())
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  return NegatibleCost::Neutral;
}

bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    return getConstantNegateCost(C) == NegatibleCost::Expensive;
  return false;
}

bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
  return false;
}

static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
    return ISD::FMINNUM;
  case ISD::FMINNUM:
    return ISD::FMAXNUM;
  case ISD::FMAXNUM_IEEE:
    return ISD::FMINNUM_IEEE;
  case ISD::FMINNUM_IEEE:
    return ISD::FMAXNUM_IEEE;
  case ISD::FMAXIMUM:
    return ISD::FMINIMUM;
  case ISD::FMINIMUM:
    return ISD::FMAXIMUM;
  case AMDGPUISD::FMAX_LEGACY:
    return AMDGPUISD::FMIN_LEGACY;
  case AMDGPUISD::FMIN_LEGACY:
    return AMDGPUISD::FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

/// \return true if it's profitable to try to push an fneg into its source
/// instruction.
bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (N0.hasOneUse()) {
    // This may be able to fold into the source, but at a code size cost. Don't
    // fold if the fold into the user is free.
    if (allUsesHaveSourceMods(N, 0))
      return false;
  } else {
    if (fnegFoldsIntoOp(N0.getNode()) &&
        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
      return false;
  }

  return true;
}

SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    else
      LHS = LHS.getOperand(0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(RHS))
      return SDValue();

SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    else
      LHS = LHS.getOperand(0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
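
  // e.g. (fneg (fminnum x, 2.0)) becomes (fmaxnum (fneg x), -2.0); both 2.0
  // and -2.0 are inline immediates, so the isConstantCostlierToNegate check
  // above lets the fold through.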

  case AMDGPUISD::FMED3: {
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());

    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
      DAG.ReplaceAllUsesWith(N0, Neg);

      for (SDNode *U : Neg->uses())
        DCI.AddToWorklist(U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
  }
  case ISD::FP_ROUND: {
    SDValue CvtSrc = N0.getOperand(0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(ISD::FP_ROUND, SL, VT,
                         CvtSrc.getOperand(0), N0.getOperand(1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
                                  DAG.getConstant(0x8000, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
      SDValue CastBack =
          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);

      SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
      Ops.back() = CastBack;
      DCI.AddToWorklist(NegHi.getNode());
      SDValue Build =
          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Casting back the result for multiple uses is beneficial in some
      // cases.

      SDValue LHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
      SDValue RHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));

      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);

      return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
                         NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}

SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  if (!N0.hasOneUse())
    return SDValue();

  switch (N0.getOpcode()) {
  case ISD::FP16_TO_FP: {
    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
    SDLoc SL(N);
    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
                                  DAG.getConstant(0x7fff, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
  }
  default:
    return SDValue();
  }
}

SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CFP)
    return SDValue();

  // XXX - Should this flush denormals?
  const APFloat &Val = CFP->getValueAPF();
  APFloat One(Val.getSemantics(), "1.0");
  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
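
// Note: performRcpCombine above folds rcp of a constant by evaluating the
// division in the constant's own floating-point semantics, e.g. (rcp 2.0)
// becomes the constant 0.5.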

SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(I);
            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
          }

          return DAG.getBuildVector(DestVT, SL, CastedElts);
        }
      }
    }

    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
    }

    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    }

    break;
  }
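
  // e.g. (v2i32 (bitcast (i64 0x123456789abcdef0))) becomes
  // (build_vector 0x9abcdef0, 0x12345678), with the low half in element 0 and
  // the high half in element 1.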

  case ISD::SHL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performShlCombine(N, DCI);
  }
  case ISD::SRL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSrlCombine(N, DCI);
  }
  case ISD::SRA: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSraCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, DL, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of
        // existing DAG Combines. If not eliminated, we will match back to BFE
        // during selection.

        // TODO: The sext_inreg of extended types ends, although we could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal,
                                       DL);
    }

    if ((OffsetVal + WidthVal) >= 32 &&
        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
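
  // e.g. (bfe_u32 x, 8, 8) extracts bits [15:8] of x; when the offset is 0,
  // the BFE above is rewritten as a plain zero- or sign-extension-in-register
  // so the generic combines can try to eliminate it.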

  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    EVT VT = N->getValueType(0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
    if (N0CFP && N1CFP && N2CFP) {
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(V1, APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(V2, APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}

//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                   const TargetRegisterClass *RC,
                                                   Register Reg, EVT VT,
                                                   const SDLoc &SL,
                                                   bool RawReg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register VReg;

  if (!MRI.isLiveIn(Reg)) {
    VReg = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VReg);
  } else {
    VReg = MRI.getLiveInVirtReg(Reg);
  }

  if (RawReg)
    return DAG.getRegister(VReg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}

// This may be called multiple times, and nothing prevents creating multiple
// objects at the same offset. See if we already defined this object.
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
                                       int64_t Offset) {
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
    if (MFI.getObjectOffset(I) == Offset) {
      assert(MFI.getObjectSize(I) == Size);
      return I;
    }
  }

  return MFI.CreateFixedObject(Size, Offset, true);
}

SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
                                                  EVT VT,
                                                  const SDLoc &SL,
                                                  int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);

  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);

  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}

SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   SDValue ArgVal,
                                                   int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
  // Stores to the argument stack area are relative to the stack pointer.
  SDValue SP =
      DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
  Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
                               MachineMemOperand::MODereferenceable);
  return Store;
}
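
// Some inputs, e.g. the packed workitem IDs, share one register; a masked
// ArgDescriptor records which bits hold the value, and loadInputValue below
// extracts the field with an srl + and pair, so a mask of 0x3ff00000 yields
// ((x >> 20) & 0x3ff).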

SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
                                             const TargetRegisterClass *RC,
                                             EVT VT, const SDLoc &SL,
                                             const ArgDescriptor &Arg) const {
  assert(Arg && "Attempting to load missing argument");

  SDValue V = Arg.isRegister() ?
    CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
    loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());

  if (!Arg.isMasked())
    return V;

  unsigned Mask = Arg.getMask();
  unsigned Shift = llvm::countr_zero<unsigned>(Mask);
  V = DAG.getNode(ISD::SRL, SL, VT, V,
                  DAG.getShiftAmountConstant(Shift, VT, SL));
  return DAG.getNode(ISD::AND, SL, VT, V,
                     DAG.getConstant(Mask >> Shift, SL, VT));
}

uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
  unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
  uint64_t ArgOffset =
      alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
  switch (Param) {
  case FIRST_IMPLICIT:
    return ArgOffset;
  case PRIVATE_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
  case SHARED_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
  case QUEUE_PTR:
    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
  }
  llvm_unreachable("unexpected implicit parameter type");
}

uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    const MachineFunction &MF, const ImplicitParameter Param) const {
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
}
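
// Implicit parameters live directly after the explicit kernel arguments,
// aligned up; e.g. with 20 bytes of explicit arguments, 8-byte implicit-arg
// alignment, and a zero explicit-arg base offset, the first implicit
// parameter lands at byte offset 24 of the kernarg segment.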

#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AMDGPUISD::NodeType)Opcode) {
  case AMDGPUISD::FIRST_NUMBER: break;
  // AMDIL DAG nodes
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(IF)
  NODE_NAME_CASE(ELSE)
  NODE_NAME_CASE(LOOP)
  NODE_NAME_CASE(CALL)
  NODE_NAME_CASE(TC_RETURN)
  NODE_NAME_CASE(TC_RETURN_GFX)
  NODE_NAME_CASE(TC_RETURN_CHAIN)
  NODE_NAME_CASE(TRAP)
  NODE_NAME_CASE(RET_GLUE)
  NODE_NAME_CASE(WAVE_ADDRESS)
  NODE_NAME_CASE(RETURN_TO_EPILOG)
  NODE_NAME_CASE(ENDPGM)
  NODE_NAME_CASE(ENDPGM_TRAP)
  NODE_NAME_CASE(SIMULATED_TRAP)
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(SETCC)
  NODE_NAME_CASE(SETREG)
  NODE_NAME_CASE(DENORM_MODE)
  NODE_NAME_CASE(FMA_W_CHAIN)
  NODE_NAME_CASE(FMUL_W_CHAIN)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(COS_HW)
  NODE_NAME_CASE(SIN_HW)
  NODE_NAME_CASE(FMAX_LEGACY)
  NODE_NAME_CASE(FMIN_LEGACY)
  NODE_NAME_CASE(FMAX3)
  NODE_NAME_CASE(SMAX3)
  NODE_NAME_CASE(UMAX3)
  NODE_NAME_CASE(FMIN3)
  NODE_NAME_CASE(SMIN3)
  NODE_NAME_CASE(UMIN3)
  NODE_NAME_CASE(FMED3)
  NODE_NAME_CASE(SMED3)
  NODE_NAME_CASE(UMED3)
  NODE_NAME_CASE(FMAXIMUM3)
  NODE_NAME_CASE(FMINIMUM3)
  NODE_NAME_CASE(FDOT2)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(FMAD_FTZ)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RCP_LEGACY)
  NODE_NAME_CASE(RCP_IFLAG)
  NODE_NAME_CASE(LOG)
  NODE_NAME_CASE(EXP)
  NODE_NAME_CASE(FMUL_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMP)
  NODE_NAME_CASE(FP_CLASS)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(CARRY)
  NODE_NAME_CASE(BORROW)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(FFBH_U32)
  NODE_NAME_CASE(FFBH_I32)
  NODE_NAME_CASE(FFBL_B32)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MULHI_U24)
  NODE_NAME_CASE(MULHI_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(MAD_I64_I32)
  NODE_NAME_CASE(MAD_U64_U32)
  NODE_NAME_CASE(PERM)
  NODE_NAME_CASE(TEXTURE_FETCH)
  NODE_NAME_CASE(R600_EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
  NODE_NAME_CASE(CVT_PK_I16_I32)
  NODE_NAME_CASE(CVT_PK_U16_U32)
  NODE_NAME_CASE(FP_TO_FP16)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
  NODE_NAME_CASE(LDS)
  NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
  NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
  NODE_NAME_CASE(DUMMY_CHAIN)
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(LOAD_D16_HI)
  NODE_NAME_CASE(LOAD_D16_LO)
  NODE_NAME_CASE(LOAD_D16_HI_I8)
  NODE_NAME_CASE(LOAD_D16_HI_U8)
  NODE_NAME_CASE(LOAD_D16_LO_I8)
  NODE_NAME_CASE(LOAD_D16_LO_U8)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(DS_ORDERED_COUNT)
  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
  NODE_NAME_CASE(BUFFER_LOAD)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
  NODE_NAME_CASE(BUFFER_LOAD_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(SBUFFER_LOAD)
  NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
  NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
  NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_STORE)
  NODE_NAME_CASE(BUFFER_STORE_BYTE)
  NODE_NAME_CASE(BUFFER_STORE_SHORT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
  NODE_NAME_CASE(BUFFER_ATOMIC_INC)
  NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)

  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
  }
  return nullptr;
}

SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
                                              SelectionDAG &DAG, int Enabled,
                                              int &RefinementSteps,
                                              bool &UseOneConstNR,
                                              bool Reciprocal) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also an f64 rsq instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}
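
// Note: getSqrtEstimate above and getRecipEstimate below both report zero
// refinement steps, so the hardware approximations are used as-is rather than
// being polished with Newton-Raphson iterations.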

SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &RefinementSteps) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    // Reciprocal, < 1 ulp error.
    //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson step performed with two fused multiply-adds (FMAs).

    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also an f64 rcp instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}

static unsigned workitemIntrinsicDim(unsigned ID) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    return 0;
  case Intrinsic::amdgcn_workitem_id_y:
    return 1;
  case Intrinsic::amdgcn_workitem_id_z:
    return 2;
  default:
    llvm_unreachable("not a workitem intrinsic");
  }
}

void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    Known.Zero = APInt::getHighBitsSet(32, 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(TrailZ, 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(24);
    RHSKnown = RHSKnown.trunc(24);

    if (Opc == AMDGPUISD::MUL_I24) {
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
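
  // e.g. for mul_u24 of two operands each known to fit in 12 bits, the
  // product fits in 24 bits, so the top 8 bits are reported as known zero.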

  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());

    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
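
// e.g. a BFE_I32 with a constant width of 8 guarantees 32 - 8 + 1 = 25 sign
// bits, matching the 25 reported for the sign-extending byte buffer loads
// below.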

unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}

unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelKnownBits &Analysis, Register R,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
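
// Note: several cases below return true only when SNaN is set. The query is
// then just whether the node can produce a signaling NaN; these operations
// presumably quiet any NaN inputs, so a signaling result is ruled out even
// when a quiet NaN is still possible.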

bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Can check no-NaNs on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Needs an is-known-positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Needs an is-known-positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::Xchg: {
    const DataLayout &DL = RMW->getFunction()->getDataLayout();
    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
    if (ValSize == 32 || ValSize == 64)
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;
  }
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}

/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}