Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
104186 views
1
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the PPCISelLowering class.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "PPCISelLowering.h"
14
#include "MCTargetDesc/PPCMCTargetDesc.h"
15
#include "MCTargetDesc/PPCPredicates.h"
16
#include "PPC.h"
17
#include "PPCCCState.h"
18
#include "PPCCallingConv.h"
19
#include "PPCFrameLowering.h"
20
#include "PPCInstrInfo.h"
21
#include "PPCMachineFunctionInfo.h"
22
#include "PPCPerfectShuffle.h"
23
#include "PPCRegisterInfo.h"
24
#include "PPCSubtarget.h"
25
#include "PPCTargetMachine.h"
26
#include "llvm/ADT/APFloat.h"
27
#include "llvm/ADT/APInt.h"
28
#include "llvm/ADT/APSInt.h"
29
#include "llvm/ADT/ArrayRef.h"
30
#include "llvm/ADT/DenseMap.h"
31
#include "llvm/ADT/STLExtras.h"
32
#include "llvm/ADT/SmallPtrSet.h"
33
#include "llvm/ADT/SmallSet.h"
34
#include "llvm/ADT/SmallVector.h"
35
#include "llvm/ADT/Statistic.h"
36
#include "llvm/ADT/StringRef.h"
37
#include "llvm/ADT/StringSwitch.h"
38
#include "llvm/CodeGen/CallingConvLower.h"
39
#include "llvm/CodeGen/ISDOpcodes.h"
40
#include "llvm/CodeGen/MachineBasicBlock.h"
41
#include "llvm/CodeGen/MachineFrameInfo.h"
42
#include "llvm/CodeGen/MachineFunction.h"
43
#include "llvm/CodeGen/MachineInstr.h"
44
#include "llvm/CodeGen/MachineInstrBuilder.h"
45
#include "llvm/CodeGen/MachineJumpTableInfo.h"
46
#include "llvm/CodeGen/MachineLoopInfo.h"
47
#include "llvm/CodeGen/MachineMemOperand.h"
48
#include "llvm/CodeGen/MachineModuleInfo.h"
49
#include "llvm/CodeGen/MachineOperand.h"
50
#include "llvm/CodeGen/MachineRegisterInfo.h"
51
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
52
#include "llvm/CodeGen/SelectionDAG.h"
53
#include "llvm/CodeGen/SelectionDAGNodes.h"
54
#include "llvm/CodeGen/TargetInstrInfo.h"
55
#include "llvm/CodeGen/TargetLowering.h"
56
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
57
#include "llvm/CodeGen/TargetRegisterInfo.h"
58
#include "llvm/CodeGen/ValueTypes.h"
59
#include "llvm/CodeGenTypes/MachineValueType.h"
60
#include "llvm/IR/CallingConv.h"
61
#include "llvm/IR/Constant.h"
62
#include "llvm/IR/Constants.h"
63
#include "llvm/IR/DataLayout.h"
64
#include "llvm/IR/DebugLoc.h"
65
#include "llvm/IR/DerivedTypes.h"
66
#include "llvm/IR/Function.h"
67
#include "llvm/IR/GlobalValue.h"
68
#include "llvm/IR/IRBuilder.h"
69
#include "llvm/IR/Instructions.h"
70
#include "llvm/IR/Intrinsics.h"
71
#include "llvm/IR/IntrinsicsPowerPC.h"
72
#include "llvm/IR/Module.h"
73
#include "llvm/IR/Type.h"
74
#include "llvm/IR/Use.h"
75
#include "llvm/IR/Value.h"
76
#include "llvm/MC/MCContext.h"
77
#include "llvm/MC/MCExpr.h"
78
#include "llvm/MC/MCRegisterInfo.h"
79
#include "llvm/MC/MCSectionXCOFF.h"
80
#include "llvm/MC/MCSymbolXCOFF.h"
81
#include "llvm/Support/AtomicOrdering.h"
82
#include "llvm/Support/BranchProbability.h"
83
#include "llvm/Support/Casting.h"
84
#include "llvm/Support/CodeGen.h"
85
#include "llvm/Support/CommandLine.h"
86
#include "llvm/Support/Compiler.h"
87
#include "llvm/Support/Debug.h"
88
#include "llvm/Support/ErrorHandling.h"
89
#include "llvm/Support/Format.h"
90
#include "llvm/Support/KnownBits.h"
91
#include "llvm/Support/MathExtras.h"
92
#include "llvm/Support/raw_ostream.h"
93
#include "llvm/Target/TargetMachine.h"
94
#include "llvm/Target/TargetOptions.h"
95
#include <algorithm>
96
#include <cassert>
97
#include <cstdint>
98
#include <iterator>
99
#include <list>
100
#include <optional>
101
#include <utility>
102
#include <vector>
103
104
using namespace llvm;
105
106
#define DEBUG_TYPE "ppc-lowering"
107
108
// Command-line switches for the PowerPC lowering code. In each definition the
// first string is the flag name and cl::desc holds its help text; every option
// in this group is tagged cl::Hidden.
// Flag "disable-ppc-preinc": boolean switch described as disabling
// pre-increment load/store generation. No cl::init is supplied here.
static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
109
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
110
111
// Flag "disable-ppc-ilp-pref": boolean switch described as disabling the ILP
// node scheduling preference. No cl::init is supplied here.
static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
112
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
113
114
// Flag "disable-ppc-unaligned": boolean switch described as disabling
// unaligned load/store generation. No cl::init is supplied here.
static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
115
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
116
117
// Flag "disable-ppc-sco": boolean switch described as disabling sibling call
// optimization. No cl::init is supplied here.
static cl::opt<bool> DisableSCO("disable-ppc-sco",
118
cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
119
120
// Flag "disable-ppc-innermost-loop-align32": boolean switch described as
// turning off the unconditional 32-byte alignment of innermost loops.
static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
121
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
122
123
// Flag "ppc-use-absolute-jumptables": boolean switch described as selecting
// absolute jump tables. No cl::init is supplied here.
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
124
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
125
126
// Flag "ppc-disable-perfect-shuffle": initialised to true through cl::init,
// so, going by its description, vector permute decomposition starts out
// disabled unless the flag is overridden.
static cl::opt<bool>
127
DisablePerfectShuffle("ppc-disable-perfect-shuffle",
128
cl::desc("disable vector permute decomposition"),
129
cl::init(true), cl::Hidden);
130
131
// Flag "disable-auto-paired-vec-st": initialised to true through cl::init.
// Unlike its neighbours this object is declared without `static`, so it has
// external linkage.
// NOTE(review): presumably referenced from another PPC source file - confirm.
cl::opt<bool> DisableAutoPairedVecSt(
132
"disable-auto-paired-vec-st",
133
cl::desc("disable automatically generated 32byte paired vector stores"),
134
cl::init(true), cl::Hidden);
135
136
// Flag "ppc-min-jump-table-entries": unsigned option initialised to 64; its
// description calls it the minimum number of entries to use a jump table.
static cl::opt<unsigned> PPCMinimumJumpTableEntries(
137
"ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
138
cl::desc("Set minimum number of entries to use a jump table on PPC"));
139
140
// Flag "ppc-gather-alias-max-depth": unsigned option initialised to 18; its
// description calls it the depth bound for alias checks in GatherAllAliases().
static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(
141
"ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,
142
cl::desc("max depth when checking alias info in GatherAllAliases()"));
143
144
// Flag "ppc-aix-shared-lib-tls-model-opt-limit": unsigned option initialised
// to 1; per its description, an inclusive per-function count of TLS
// local-dynamic accesses up to which initial-exec is used.
static cl::opt<unsigned> PPCAIXTLSModelOptUseIEForLDLimit(
145
"ppc-aix-shared-lib-tls-model-opt-limit", cl::init(1), cl::Hidden,
146
cl::desc("Set inclusive limit count of TLS local-dynamic access(es) in a "
147
"function to use initial-exec"));
148
149
// Counters declared with the STATISTIC macro; the second argument of each is
// the counter's human-readable description.
// NOTE(review): the sites that increment these counters are outside this part
// of the file - confirm where each is bumped before relying on the wording.
// Counts tail calls.
STATISTIC(NumTailCalls, "Number of tail calls");
150
// Counts sibling calls.
STATISTIC(NumSiblingCalls, "Number of sibling calls");
151
// Counts shuffles lowered to a VPERM or XXPERM.
STATISTIC(ShufflesHandledWithVPERM,
152
"Number of shuffles lowered to a VPERM or XXPERM");
153
// Counts probed dynamic stack allocations.
STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
154
155
// Forward declaration of a file-local (static) predicate taking a shuffle
// node, an unsigned value and an int.
// NOTE(review): judging by the name, the arguments are presumably an element
// width in bytes and a step - confirm against the definition.
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
156
157
// Forward declaration of a file-local (static) helper that takes a vector
// value and a debug location and returns an SDValue built in the given DAG.
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
158
159
// File-local string constant "__ssp_canary_word".
// NOTE(review): presumably the symbol name of the AIX stack-protector canary
// word - confirm at the use site.
static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";
160
161
// A faster local-[exec|dynamic] TLS access sequence (enabled with the
162
// -maix-small-local-[exec|dynamic]-tls option) can be produced for TLS
163
// variables; consistent with the IBM XL compiler, we apply a max size of
164
// slightly under 32KB.
165
// The limit is 32751, which is 17 less than 32768.
constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;
166
167
// FIXME: Remove this once the bug has been fixed!
168
// Declaration only (extern): the option object itself is defined elsewhere.
// The PPCTargetLowering constructor tests it and, when it is set, marks
// ISD::TRUNCATE for MVT::i1 as Custom.
extern cl::opt<bool> ANDIGlueBug;
169
170
PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
171
const PPCSubtarget &STI)
172
: TargetLowering(TM), Subtarget(STI) {
173
// Initialize map that relates the PPC addressing modes to the computed flags
174
// of a load/store instruction. The map is used to determine the optimal
175
// addressing mode when selecting load and stores.
176
initializeAddrModeMap();
177
// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
178
// arguments are at least 4/8 bytes aligned.
179
bool isPPC64 = Subtarget.isPPC64();
180
setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));
181
182
// Set up the register classes.
183
addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
184
if (!useSoftFloat()) {
185
if (hasSPE()) {
186
addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
187
// EFPU2 APU only supports f32
188
if (!Subtarget.hasEFPU2())
189
addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
190
} else {
191
addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
192
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
193
}
194
}
195
196
// Match BITREVERSE to customized fast code sequence in the td file.
197
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
198
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
199
200
// Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
201
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
202
203
// Custom lower inline assembly to check for special registers.
204
setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
205
setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom);
206
207
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
208
for (MVT VT : MVT::integer_valuetypes()) {
209
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
210
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
211
}
212
213
if (Subtarget.isISA3_0()) {
214
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
215
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
216
setTruncStoreAction(MVT::f64, MVT::f16, Legal);
217
setTruncStoreAction(MVT::f32, MVT::f16, Legal);
218
} else {
219
// No extending loads from f16 or HW conversions back and forth.
220
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
221
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
222
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
223
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
224
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
225
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
226
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
227
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
228
}
229
230
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
231
232
// PowerPC has pre-inc loads and stores.
233
setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
234
setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
235
setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
236
setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
237
setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
238
setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
239
setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
240
setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
241
setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
242
setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
243
if (!Subtarget.hasSPE()) {
244
setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
245
setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
246
setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
247
setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
248
}
249
250
// PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
251
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
252
for (MVT VT : ScalarIntVTs) {
253
setOperationAction(ISD::ADDC, VT, Legal);
254
setOperationAction(ISD::ADDE, VT, Legal);
255
setOperationAction(ISD::SUBC, VT, Legal);
256
setOperationAction(ISD::SUBE, VT, Legal);
257
}
258
259
if (Subtarget.useCRBits()) {
260
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
261
262
if (isPPC64 || Subtarget.hasFPCVT()) {
263
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote);
264
AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1,
265
isPPC64 ? MVT::i64 : MVT::i32);
266
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote);
267
AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1,
268
isPPC64 ? MVT::i64 : MVT::i32);
269
270
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
271
AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
272
isPPC64 ? MVT::i64 : MVT::i32);
273
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
274
AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
275
isPPC64 ? MVT::i64 : MVT::i32);
276
277
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote);
278
AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1,
279
isPPC64 ? MVT::i64 : MVT::i32);
280
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote);
281
AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1,
282
isPPC64 ? MVT::i64 : MVT::i32);
283
284
setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
285
AddPromotedToType(ISD::FP_TO_SINT, MVT::i1,
286
isPPC64 ? MVT::i64 : MVT::i32);
287
setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
288
AddPromotedToType(ISD::FP_TO_UINT, MVT::i1,
289
isPPC64 ? MVT::i64 : MVT::i32);
290
} else {
291
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom);
292
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom);
293
setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
294
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
295
}
296
297
// PowerPC does not support direct load/store of condition registers.
298
setOperationAction(ISD::LOAD, MVT::i1, Custom);
299
setOperationAction(ISD::STORE, MVT::i1, Custom);
300
301
// FIXME: Remove this once the ANDI glue bug is fixed:
302
if (ANDIGlueBug)
303
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
304
305
for (MVT VT : MVT::integer_valuetypes()) {
306
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
307
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
308
setTruncStoreAction(VT, MVT::i1, Expand);
309
}
310
311
addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
312
}
313
314
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
315
// PPC (the libcall is not available).
316
setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
317
setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
318
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::ppcf128, Custom);
319
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::ppcf128, Custom);
320
321
// We do not currently implement these libm ops for PowerPC.
322
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
323
setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
324
setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
325
setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
326
setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
327
setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
328
329
// PowerPC has no SREM/UREM instructions unless we are on P9
330
// On P9 we may use a hardware instruction to compute the remainder.
331
// When the result of both the remainder and the division is required it is
332
// more efficient to compute the remainder from the result of the division
333
// rather than use the remainder instruction. The instructions are legalized
334
// directly because the DivRemPairsPass performs the transformation at the IR
335
// level.
336
if (Subtarget.isISA3_0()) {
337
setOperationAction(ISD::SREM, MVT::i32, Legal);
338
setOperationAction(ISD::UREM, MVT::i32, Legal);
339
setOperationAction(ISD::SREM, MVT::i64, Legal);
340
setOperationAction(ISD::UREM, MVT::i64, Legal);
341
} else {
342
setOperationAction(ISD::SREM, MVT::i32, Expand);
343
setOperationAction(ISD::UREM, MVT::i32, Expand);
344
setOperationAction(ISD::SREM, MVT::i64, Expand);
345
setOperationAction(ISD::UREM, MVT::i64, Expand);
346
}
347
348
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
349
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
350
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
351
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
352
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
353
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
354
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
355
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
356
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
357
358
// Handle constrained floating-point operations of scalar.
359
// TODO: Handle SPE specific operation.
360
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
361
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
362
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
363
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
364
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
365
366
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
367
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
368
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
369
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
370
371
if (!Subtarget.hasSPE()) {
372
setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
373
setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
374
}
375
376
if (Subtarget.hasVSX()) {
377
setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal);
378
setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal);
379
}
380
381
if (Subtarget.hasFSQRT()) {
382
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
383
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
384
}
385
386
if (Subtarget.hasFPRND()) {
387
setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
388
setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
389
setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
390
setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);
391
392
setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
393
setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
394
setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
395
setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
396
}
397
398
// We don't support sin/cos/sqrt/fmod/pow
399
setOperationAction(ISD::FSIN , MVT::f64, Expand);
400
setOperationAction(ISD::FCOS , MVT::f64, Expand);
401
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
402
setOperationAction(ISD::FREM , MVT::f64, Expand);
403
setOperationAction(ISD::FPOW , MVT::f64, Expand);
404
setOperationAction(ISD::FSIN , MVT::f32, Expand);
405
setOperationAction(ISD::FCOS , MVT::f32, Expand);
406
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
407
setOperationAction(ISD::FREM , MVT::f32, Expand);
408
setOperationAction(ISD::FPOW , MVT::f32, Expand);
409
410
// MASS transformation for LLVM intrinsics with replicating fast-math flag
411
// to be consistent to PPCGenScalarMASSEntries pass
412
if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {
413
setOperationAction(ISD::FSIN , MVT::f64, Custom);
414
setOperationAction(ISD::FCOS , MVT::f64, Custom);
415
setOperationAction(ISD::FPOW , MVT::f64, Custom);
416
setOperationAction(ISD::FLOG, MVT::f64, Custom);
417
setOperationAction(ISD::FLOG10, MVT::f64, Custom);
418
setOperationAction(ISD::FEXP, MVT::f64, Custom);
419
setOperationAction(ISD::FSIN , MVT::f32, Custom);
420
setOperationAction(ISD::FCOS , MVT::f32, Custom);
421
setOperationAction(ISD::FPOW , MVT::f32, Custom);
422
setOperationAction(ISD::FLOG, MVT::f32, Custom);
423
setOperationAction(ISD::FLOG10, MVT::f32, Custom);
424
setOperationAction(ISD::FEXP, MVT::f32, Custom);
425
}
426
427
if (Subtarget.hasSPE()) {
428
setOperationAction(ISD::FMA , MVT::f64, Expand);
429
setOperationAction(ISD::FMA , MVT::f32, Expand);
430
} else {
431
setOperationAction(ISD::FMA , MVT::f64, Legal);
432
setOperationAction(ISD::FMA , MVT::f32, Legal);
433
}
434
435
if (Subtarget.hasSPE())
436
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
437
438
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
439
440
// If we're enabling GP optimizations, use hardware square root
441
if (!Subtarget.hasFSQRT() &&
442
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
443
Subtarget.hasFRE()))
444
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
445
446
if (!Subtarget.hasFSQRT() &&
447
!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
448
Subtarget.hasFRES()))
449
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
450
451
if (Subtarget.hasFCPSGN()) {
452
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
453
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
454
} else {
455
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
456
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
457
}
458
459
if (Subtarget.hasFPRND()) {
460
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
461
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
462
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
463
setOperationAction(ISD::FROUND, MVT::f64, Legal);
464
465
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
466
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
467
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
468
setOperationAction(ISD::FROUND, MVT::f32, Legal);
469
}
470
471
// Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP
472
// instruction xxbrd to speed up scalar BSWAP64.
473
if (Subtarget.isISA3_1()) {
474
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
475
setOperationAction(ISD::BSWAP, MVT::i64, Legal);
476
} else {
477
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
478
setOperationAction(
479
ISD::BSWAP, MVT::i64,
480
(Subtarget.hasP9Vector() && Subtarget.isPPC64()) ? Custom : Expand);
481
}
482
483
// CTPOP or CTTZ were introduced in P8/P9 respectively
484
if (Subtarget.isISA3_0()) {
485
setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
486
setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
487
} else {
488
setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
489
setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
490
}
491
492
if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
493
setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
494
setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
495
} else {
496
setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
497
setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
498
}
499
500
// PowerPC does not have ROTR
501
setOperationAction(ISD::ROTR, MVT::i32 , Expand);
502
setOperationAction(ISD::ROTR, MVT::i64 , Expand);
503
504
if (!Subtarget.useCRBits()) {
505
// PowerPC does not have Select
506
setOperationAction(ISD::SELECT, MVT::i32, Expand);
507
setOperationAction(ISD::SELECT, MVT::i64, Expand);
508
setOperationAction(ISD::SELECT, MVT::f32, Expand);
509
setOperationAction(ISD::SELECT, MVT::f64, Expand);
510
}
511
512
// PowerPC wants to turn select_cc of FP into fsel when possible.
513
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
514
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
515
516
// PowerPC wants to optimize integer setcc a bit
517
if (!Subtarget.useCRBits())
518
setOperationAction(ISD::SETCC, MVT::i32, Custom);
519
520
if (Subtarget.hasFPU()) {
521
setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);
522
setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal);
523
setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Legal);
524
525
setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
526
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
527
setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Legal);
528
}
529
530
// PowerPC does not have BRCOND which requires SetCC
531
if (!Subtarget.useCRBits())
532
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
533
534
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
535
536
if (Subtarget.hasSPE()) {
537
// SPE has built-in conversions
538
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
539
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
540
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
541
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
542
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
543
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
544
545
// SPE supports signaling compare of f32/f64.
546
setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
547
setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
548
} else {
549
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
550
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
551
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
552
553
// PowerPC does not have [U|S]INT_TO_FP
554
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Expand);
555
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Expand);
556
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
557
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
558
}
559
560
if (Subtarget.hasDirectMove() && isPPC64) {
561
setOperationAction(ISD::BITCAST, MVT::f32, Legal);
562
setOperationAction(ISD::BITCAST, MVT::i32, Legal);
563
setOperationAction(ISD::BITCAST, MVT::i64, Legal);
564
setOperationAction(ISD::BITCAST, MVT::f64, Legal);
565
if (TM.Options.UnsafeFPMath) {
566
setOperationAction(ISD::LRINT, MVT::f64, Legal);
567
setOperationAction(ISD::LRINT, MVT::f32, Legal);
568
setOperationAction(ISD::LLRINT, MVT::f64, Legal);
569
setOperationAction(ISD::LLRINT, MVT::f32, Legal);
570
setOperationAction(ISD::LROUND, MVT::f64, Legal);
571
setOperationAction(ISD::LROUND, MVT::f32, Legal);
572
setOperationAction(ISD::LLROUND, MVT::f64, Legal);
573
setOperationAction(ISD::LLROUND, MVT::f32, Legal);
574
}
575
} else {
576
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
577
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
578
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
579
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
580
}
581
582
// We cannot sextinreg(i1). Expand to shifts.
583
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
584
585
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
586
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
587
// support continuation, user-level threading, etc. As a result, no
588
// other SjLj exception interfaces are implemented and please don't build
589
// your own exception handling based on them.
590
// LLVM/Clang supports zero-cost DWARF exception handling.
591
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
592
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
593
594
// We want to legalize GlobalAddress and ConstantPool nodes into the
595
// appropriate instructions to materialize the address.
596
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
597
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
598
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
599
setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
600
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
601
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
602
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
603
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
604
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
605
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
606
607
// TRAP is legal.
608
setOperationAction(ISD::TRAP, MVT::Other, Legal);
609
610
// TRAMPOLINE is custom lowered.
611
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
612
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
613
614
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
615
setOperationAction(ISD::VASTART , MVT::Other, Custom);
616
617
if (Subtarget.is64BitELFABI()) {
618
// VAARG always uses double-word chunks, so promote anything smaller.
619
setOperationAction(ISD::VAARG, MVT::i1, Promote);
620
AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
621
setOperationAction(ISD::VAARG, MVT::i8, Promote);
622
AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
623
setOperationAction(ISD::VAARG, MVT::i16, Promote);
624
AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
625
setOperationAction(ISD::VAARG, MVT::i32, Promote);
626
AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
627
setOperationAction(ISD::VAARG, MVT::Other, Expand);
628
} else if (Subtarget.is32BitELFABI()) {
629
// VAARG is custom lowered with the 32-bit SVR4 ABI.
630
setOperationAction(ISD::VAARG, MVT::Other, Custom);
631
setOperationAction(ISD::VAARG, MVT::i64, Custom);
632
} else
633
setOperationAction(ISD::VAARG, MVT::Other, Expand);
634
635
// VACOPY is custom lowered with the 32-bit SVR4 ABI.
636
if (Subtarget.is32BitELFABI())
637
setOperationAction(ISD::VACOPY , MVT::Other, Custom);
638
else
639
setOperationAction(ISD::VACOPY , MVT::Other, Expand);
640
641
// Use the default implementation.
642
setOperationAction(ISD::VAEND , MVT::Other, Expand);
643
setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
644
setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
645
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
646
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
647
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
648
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
649
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
650
setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);
651
652
// We want to custom lower some of our intrinsics.
653
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
654
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
655
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
656
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
657
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
658
659
// To handle counter-based loop conditions.
660
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
661
662
setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
663
setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
664
setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
665
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
666
667
// Comparisons that require checking two conditions.
668
if (Subtarget.hasSPE()) {
669
setCondCodeAction(ISD::SETO, MVT::f32, Expand);
670
setCondCodeAction(ISD::SETO, MVT::f64, Expand);
671
setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
672
setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
673
}
674
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
675
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
676
setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
677
setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
678
setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
679
setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
680
setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
681
setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
682
setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
683
setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
684
setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
685
setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
686
687
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
688
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
689
690
if (Subtarget.has64BitSupport()) {
691
// They also have instructions for converting between i64 and fp.
692
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
693
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Expand);
694
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
695
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);
696
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
697
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
698
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
699
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
700
// This is just the low 32 bits of a (signed) fp->i64 conversion.
701
// We cannot do this with Promote because i64 is not a legal type.
702
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
703
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
704
705
if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) {
706
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
707
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
708
}
709
} else {
710
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
711
if (Subtarget.hasSPE()) {
712
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
713
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
714
} else {
715
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Expand);
716
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
717
}
718
}
719
720
// With the instructions enabled under FPCVT, we can do everything.
721
if (Subtarget.hasFPCVT()) {
722
if (Subtarget.has64BitSupport()) {
723
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
724
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
725
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
726
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
727
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
728
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
729
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
730
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
731
}
732
733
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
734
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
735
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
736
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
737
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
738
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
739
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
740
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
741
}
742
743
if (Subtarget.use64BitRegs()) {
744
// 64-bit PowerPC implementations can support i64 types directly
745
addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
746
// BUILD_PAIR can't be handled natively, and should be expanded to shl/or
747
setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
748
// 64-bit PowerPC wants to expand i128 shifts itself.
749
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
750
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
751
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
752
} else {
753
// 32-bit PowerPC wants to expand i64 shifts itself.
754
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
755
setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
756
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
757
}
758
759
// PowerPC has better expansions for funnel shifts than the generic
760
// TargetLowering::expandFunnelShift.
761
if (Subtarget.has64BitSupport()) {
762
setOperationAction(ISD::FSHL, MVT::i64, Custom);
763
setOperationAction(ISD::FSHR, MVT::i64, Custom);
764
}
765
setOperationAction(ISD::FSHL, MVT::i32, Custom);
766
setOperationAction(ISD::FSHR, MVT::i32, Custom);
767
768
if (Subtarget.hasVSX()) {
769
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
770
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
771
setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
772
setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
773
}
774
775
if (Subtarget.hasAltivec()) {
776
for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
777
setOperationAction(ISD::SADDSAT, VT, Legal);
778
setOperationAction(ISD::SSUBSAT, VT, Legal);
779
setOperationAction(ISD::UADDSAT, VT, Legal);
780
setOperationAction(ISD::USUBSAT, VT, Legal);
781
}
782
// First set operation action for all vector types to expand. Then we
783
// will selectively turn on ones that can be effectively codegen'd.
784
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
785
// add/sub are legal for all supported vector VT's.
786
setOperationAction(ISD::ADD, VT, Legal);
787
setOperationAction(ISD::SUB, VT, Legal);
788
789
// For v2i64, these are only valid with P8Vector. This is corrected after
790
// the loop.
791
if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
792
setOperationAction(ISD::SMAX, VT, Legal);
793
setOperationAction(ISD::SMIN, VT, Legal);
794
setOperationAction(ISD::UMAX, VT, Legal);
795
setOperationAction(ISD::UMIN, VT, Legal);
796
}
797
else {
798
setOperationAction(ISD::SMAX, VT, Expand);
799
setOperationAction(ISD::SMIN, VT, Expand);
800
setOperationAction(ISD::UMAX, VT, Expand);
801
setOperationAction(ISD::UMIN, VT, Expand);
802
}
803
804
if (Subtarget.hasVSX()) {
805
setOperationAction(ISD::FMAXNUM, VT, Legal);
806
setOperationAction(ISD::FMINNUM, VT, Legal);
807
}
808
809
// Vector instructions introduced in P8
810
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
811
setOperationAction(ISD::CTPOP, VT, Legal);
812
setOperationAction(ISD::CTLZ, VT, Legal);
813
}
814
else {
815
setOperationAction(ISD::CTPOP, VT, Expand);
816
setOperationAction(ISD::CTLZ, VT, Expand);
817
}
818
819
// Vector instructions introduced in P9
820
if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
821
setOperationAction(ISD::CTTZ, VT, Legal);
822
else
823
setOperationAction(ISD::CTTZ, VT, Expand);
824
825
// We promote all shuffles to v16i8.
826
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
827
AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
828
829
// We promote all non-typed operations to v4i32.
830
setOperationAction(ISD::AND , VT, Promote);
831
AddPromotedToType (ISD::AND , VT, MVT::v4i32);
832
setOperationAction(ISD::OR , VT, Promote);
833
AddPromotedToType (ISD::OR , VT, MVT::v4i32);
834
setOperationAction(ISD::XOR , VT, Promote);
835
AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
836
setOperationAction(ISD::LOAD , VT, Promote);
837
AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
838
setOperationAction(ISD::SELECT, VT, Promote);
839
AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
840
setOperationAction(ISD::VSELECT, VT, Legal);
841
setOperationAction(ISD::SELECT_CC, VT, Promote);
842
AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
843
setOperationAction(ISD::STORE, VT, Promote);
844
AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
845
846
// No other operations are legal.
847
setOperationAction(ISD::MUL , VT, Expand);
848
setOperationAction(ISD::SDIV, VT, Expand);
849
setOperationAction(ISD::SREM, VT, Expand);
850
setOperationAction(ISD::UDIV, VT, Expand);
851
setOperationAction(ISD::UREM, VT, Expand);
852
setOperationAction(ISD::FDIV, VT, Expand);
853
setOperationAction(ISD::FREM, VT, Expand);
854
setOperationAction(ISD::FNEG, VT, Expand);
855
setOperationAction(ISD::FSQRT, VT, Expand);
856
setOperationAction(ISD::FLOG, VT, Expand);
857
setOperationAction(ISD::FLOG10, VT, Expand);
858
setOperationAction(ISD::FLOG2, VT, Expand);
859
setOperationAction(ISD::FEXP, VT, Expand);
860
setOperationAction(ISD::FEXP2, VT, Expand);
861
setOperationAction(ISD::FSIN, VT, Expand);
862
setOperationAction(ISD::FCOS, VT, Expand);
863
setOperationAction(ISD::FABS, VT, Expand);
864
setOperationAction(ISD::FFLOOR, VT, Expand);
865
setOperationAction(ISD::FCEIL, VT, Expand);
866
setOperationAction(ISD::FTRUNC, VT, Expand);
867
setOperationAction(ISD::FRINT, VT, Expand);
868
setOperationAction(ISD::FLDEXP, VT, Expand);
869
setOperationAction(ISD::FNEARBYINT, VT, Expand);
870
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
871
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
872
setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
873
setOperationAction(ISD::MULHU, VT, Expand);
874
setOperationAction(ISD::MULHS, VT, Expand);
875
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
876
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
877
setOperationAction(ISD::UDIVREM, VT, Expand);
878
setOperationAction(ISD::SDIVREM, VT, Expand);
879
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
880
setOperationAction(ISD::FPOW, VT, Expand);
881
setOperationAction(ISD::BSWAP, VT, Expand);
882
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
883
setOperationAction(ISD::ROTL, VT, Expand);
884
setOperationAction(ISD::ROTR, VT, Expand);
885
886
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
887
setTruncStoreAction(VT, InnerVT, Expand);
888
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
889
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
890
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
891
}
892
}
893
setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
894
if (!Subtarget.hasP8Vector()) {
895
setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
896
setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
897
setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
898
setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
899
}
900
901
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
902
// with merges, splats, etc.
903
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
904
905
// Vector truncates to sub-word integer that fit in an Altivec/VSX register
906
// are cheap, so handle them before they get expanded to scalar.
907
setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
908
setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
909
setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
910
setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
911
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
912
913
setOperationAction(ISD::AND , MVT::v4i32, Legal);
914
setOperationAction(ISD::OR , MVT::v4i32, Legal);
915
setOperationAction(ISD::XOR , MVT::v4i32, Legal);
916
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
917
setOperationAction(ISD::SELECT, MVT::v4i32,
918
Subtarget.useCRBits() ? Legal : Expand);
919
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
920
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
921
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
922
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
923
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);
924
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
925
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
926
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
927
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
928
setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
929
setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
930
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
931
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
932
933
// Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
934
setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
935
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
936
if (Subtarget.hasAltivec())
937
for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
938
setOperationAction(ISD::ROTL, VT, Legal);
939
// With hasP8Altivec set, we can lower ISD::ROTL to vrld.
940
if (Subtarget.hasP8Altivec())
941
setOperationAction(ISD::ROTL, MVT::v2i64, Legal);
942
943
addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
944
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
945
addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
946
addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
947
948
setOperationAction(ISD::MUL, MVT::v4f32, Legal);
949
setOperationAction(ISD::FMA, MVT::v4f32, Legal);
950
951
if (Subtarget.hasVSX()) {
952
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
953
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
954
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
955
}
956
957
if (Subtarget.hasP8Altivec())
958
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
959
else
960
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
961
962
if (Subtarget.isISA3_1()) {
963
setOperationAction(ISD::MUL, MVT::v2i64, Legal);
964
setOperationAction(ISD::MULHS, MVT::v2i64, Legal);
965
setOperationAction(ISD::MULHU, MVT::v2i64, Legal);
966
setOperationAction(ISD::MULHS, MVT::v4i32, Legal);
967
setOperationAction(ISD::MULHU, MVT::v4i32, Legal);
968
setOperationAction(ISD::UDIV, MVT::v2i64, Legal);
969
setOperationAction(ISD::SDIV, MVT::v2i64, Legal);
970
setOperationAction(ISD::UDIV, MVT::v4i32, Legal);
971
setOperationAction(ISD::SDIV, MVT::v4i32, Legal);
972
setOperationAction(ISD::UREM, MVT::v2i64, Legal);
973
setOperationAction(ISD::SREM, MVT::v2i64, Legal);
974
setOperationAction(ISD::UREM, MVT::v4i32, Legal);
975
setOperationAction(ISD::SREM, MVT::v4i32, Legal);
976
setOperationAction(ISD::UREM, MVT::v1i128, Legal);
977
setOperationAction(ISD::SREM, MVT::v1i128, Legal);
978
setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
979
setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
980
setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
981
}
982
983
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
984
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
985
986
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
987
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
988
989
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
990
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
991
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
992
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
993
994
// Altivec does not contain unordered floating-point compare instructions
995
setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
996
setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
997
setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
998
setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
999
1000
if (Subtarget.hasVSX()) {
1001
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
1002
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
1003
if (Subtarget.hasP8Vector()) {
1004
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
1005
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
1006
}
1007
if (Subtarget.hasDirectMove() && isPPC64) {
1008
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
1009
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
1010
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
1011
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
1012
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
1013
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
1014
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
1015
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
1016
}
1017
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
1018
1019
// The nearbyint variants are not allowed to raise the inexact exception
1020
// so we can only code-gen them with unsafe math.
1021
if (TM.Options.UnsafeFPMath) {
1022
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1023
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1024
}
1025
1026
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
1027
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
1028
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
1029
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
1030
setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
1031
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
1032
setOperationAction(ISD::FROUND, MVT::f64, Legal);
1033
setOperationAction(ISD::FRINT, MVT::f64, Legal);
1034
1035
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
1036
setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1037
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1038
setOperationAction(ISD::FROUND, MVT::f32, Legal);
1039
setOperationAction(ISD::FRINT, MVT::f32, Legal);
1040
1041
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
1042
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
1043
1044
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
1045
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
1046
1047
// Share the Altivec comparison restrictions.
1048
setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
1049
setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
1050
setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
1051
setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
1052
1053
setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
1054
setOperationAction(ISD::STORE, MVT::v2f64, Legal);
1055
1056
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
1057
1058
if (Subtarget.hasP8Vector())
1059
addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
1060
1061
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
1062
1063
addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
1064
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
1065
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
1066
1067
if (Subtarget.hasP8Altivec()) {
1068
setOperationAction(ISD::SHL, MVT::v2i64, Legal);
1069
setOperationAction(ISD::SRA, MVT::v2i64, Legal);
1070
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
1071
1072
// 128 bit shifts can be accomplished via 3 instructions for SHL and
1073
// SRL, but not for SRA because of the instructions available:
1074
// VS{RL} and VS{RL}O. However due to direct move costs, it's not worth
1075
// doing
1076
setOperationAction(ISD::SHL, MVT::v1i128, Expand);
1077
setOperationAction(ISD::SRL, MVT::v1i128, Expand);
1078
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
1079
1080
setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
1081
}
1082
else {
1083
setOperationAction(ISD::SHL, MVT::v2i64, Expand);
1084
setOperationAction(ISD::SRA, MVT::v2i64, Expand);
1085
setOperationAction(ISD::SRL, MVT::v2i64, Expand);
1086
1087
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
1088
1089
// VSX v2i64 only supports non-arithmetic operations.
1090
setOperationAction(ISD::ADD, MVT::v2i64, Expand);
1091
setOperationAction(ISD::SUB, MVT::v2i64, Expand);
1092
}
1093
1094
if (Subtarget.isISA3_1())
1095
setOperationAction(ISD::SETCC, MVT::v1i128, Legal);
1096
else
1097
setOperationAction(ISD::SETCC, MVT::v1i128, Expand);
1098
1099
setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
1100
AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
1101
setOperationAction(ISD::STORE, MVT::v2i64, Promote);
1102
AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
1103
1104
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
1105
1106
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
1107
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
1108
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
1109
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
1110
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
1111
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
1112
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
1113
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
1114
1115
// Custom handling for partial vectors of integers converted to
1116
// floating point. We already have optimal handling for v2i32 through
1117
// the DAG combine, so those aren't necessary.
1118
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i8, Custom);
1119
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i8, Custom);
1120
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i16, Custom);
1121
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i16, Custom);
1122
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i8, Custom);
1123
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i8, Custom);
1124
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i16, Custom);
1125
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i16, Custom);
1126
setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
1127
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
1128
setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
1129
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1130
setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
1131
setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
1132
setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
1133
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1134
1135
setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
1136
setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
1137
setOperationAction(ISD::FABS, MVT::v4f32, Legal);
1138
setOperationAction(ISD::FABS, MVT::v2f64, Legal);
1139
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
1140
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);
1141
1142
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
1143
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
1144
1145
// Handle constrained floating-point operations of vector.
1146
// The predicate is `hasVSX` because altivec instruction has
1147
// no exception but VSX vector instruction has.
1148
setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1149
setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1150
setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1151
setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1152
setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
1153
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1154
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
1155
setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
1156
setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
1157
setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
1158
setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
1159
setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
1160
setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
1161
1162
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1163
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1164
setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1165
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1166
setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
1167
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1168
setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
1169
setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
1170
setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
1171
setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
1172
setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
1173
setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
1174
setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
1175
1176
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
1177
addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
1178
1179
for (MVT FPT : MVT::fp_valuetypes())
1180
setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
1181
1182
// Expand the SELECT to SELECT_CC
1183
setOperationAction(ISD::SELECT, MVT::f128, Expand);
1184
1185
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
1186
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
1187
1188
// No implementation for these ops for PowerPC.
1189
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
1190
setOperationAction(ISD::FSIN, MVT::f128, Expand);
1191
setOperationAction(ISD::FCOS, MVT::f128, Expand);
1192
setOperationAction(ISD::FPOW, MVT::f128, Expand);
1193
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
1194
setOperationAction(ISD::FREM, MVT::f128, Expand);
1195
}
1196
1197
if (Subtarget.hasP8Altivec()) {
1198
addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
1199
addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
1200
}
1201
1202
if (Subtarget.hasP9Vector()) {
1203
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1204
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1205
1206
// Test data class instructions store results in CR bits.
1207
if (Subtarget.useCRBits()) {
1208
setOperationAction(ISD::IS_FPCLASS, MVT::f32, Custom);
1209
setOperationAction(ISD::IS_FPCLASS, MVT::f64, Custom);
1210
setOperationAction(ISD::IS_FPCLASS, MVT::f128, Custom);
1211
}
1212
1213
// 128 bit shifts can be accomplished via 3 instructions for SHL and
1214
// SRL, but not for SRA because of the instructions available:
1215
// VS{RL} and VS{RL}O.
1216
setOperationAction(ISD::SHL, MVT::v1i128, Legal);
1217
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
1218
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
1219
1220
setOperationAction(ISD::FADD, MVT::f128, Legal);
1221
setOperationAction(ISD::FSUB, MVT::f128, Legal);
1222
setOperationAction(ISD::FDIV, MVT::f128, Legal);
1223
setOperationAction(ISD::FMUL, MVT::f128, Legal);
1224
setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
1225
1226
setOperationAction(ISD::FMA, MVT::f128, Legal);
1227
setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
1228
setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
1229
setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
1230
setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
1231
setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
1232
setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
1233
1234
setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
1235
setOperationAction(ISD::FRINT, MVT::f128, Legal);
1236
setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
1237
setOperationAction(ISD::FCEIL, MVT::f128, Legal);
1238
setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
1239
setOperationAction(ISD::FROUND, MVT::f128, Legal);
1240
1241
setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
1242
setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
1243
setOperationAction(ISD::BITCAST, MVT::i128, Custom);
1244
1245
// Handle constrained floating-point operations of fp128
1246
setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
1247
setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
1248
setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
1249
setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
1250
setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
1251
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
1252
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
1253
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
1254
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
1255
setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
1256
setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
1257
setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
1258
setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
1259
setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
1260
setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
1261
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1262
setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
1263
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
1264
setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
1265
setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
1266
} else if (Subtarget.hasVSX()) {
1267
setOperationAction(ISD::LOAD, MVT::f128, Promote);
1268
setOperationAction(ISD::STORE, MVT::f128, Promote);
1269
1270
AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32);
1271
AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32);
1272
1273
// Set FADD/FSUB as libcall to keep the legalizer from expanding the
1274
// fp_to_uint and int_to_fp.
1275
setOperationAction(ISD::FADD, MVT::f128, LibCall);
1276
setOperationAction(ISD::FSUB, MVT::f128, LibCall);
1277
1278
setOperationAction(ISD::FMUL, MVT::f128, Expand);
1279
setOperationAction(ISD::FDIV, MVT::f128, Expand);
1280
setOperationAction(ISD::FNEG, MVT::f128, Expand);
1281
setOperationAction(ISD::FABS, MVT::f128, Expand);
1282
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
1283
setOperationAction(ISD::FMA, MVT::f128, Expand);
1284
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
1285
1286
// Expand the fp_extend if the target type is fp128.
1287
setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
1288
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand);
1289
1290
// Expand the fp_round if the source type is fp128.
1291
for (MVT VT : {MVT::f32, MVT::f64}) {
1292
setOperationAction(ISD::FP_ROUND, VT, Custom);
1293
setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1294
}
1295
1296
setOperationAction(ISD::SETCC, MVT::f128, Custom);
1297
setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
1298
setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
1299
setOperationAction(ISD::BR_CC, MVT::f128, Expand);
1300
1301
// Lower following f128 select_cc pattern:
1302
// select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE
1303
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
1304
1305
// We need to handle f128 SELECT_CC with integer result type.
1306
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1307
setOperationAction(ISD::SELECT_CC, MVT::i64, isPPC64 ? Custom : Expand);
1308
}
1309
1310
if (Subtarget.hasP9Altivec()) {
1311
if (Subtarget.isISA3_1()) {
1312
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
1313
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Legal);
1314
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Legal);
1315
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
1316
} else {
1317
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1318
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1319
}
1320
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
1321
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
1322
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
1323
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);
1324
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
1325
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
1326
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
1327
1328
setOperationAction(ISD::ABDU, MVT::v16i8, Legal);
1329
setOperationAction(ISD::ABDU, MVT::v8i16, Legal);
1330
setOperationAction(ISD::ABDU, MVT::v4i32, Legal);
1331
setOperationAction(ISD::ABDS, MVT::v4i32, Legal);
1332
}
1333
1334
if (Subtarget.hasP10Vector()) {
1335
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
1336
}
1337
}
1338
1339
if (Subtarget.pairedVectorMemops()) {
1340
addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
1341
setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
1342
setOperationAction(ISD::STORE, MVT::v256i1, Custom);
1343
}
1344
if (Subtarget.hasMMA()) {
1345
if (Subtarget.isISAFuture())
1346
addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass);
1347
else
1348
addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
1349
setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
1350
setOperationAction(ISD::STORE, MVT::v512i1, Custom);
1351
setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
1352
}
1353
1354
if (Subtarget.has64BitSupport())
1355
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
1356
1357
if (Subtarget.isISA3_1())
1358
setOperationAction(ISD::SRA, MVT::v1i128, Legal);
1359
1360
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
1361
1362
if (!isPPC64) {
1363
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
1364
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
1365
}
1366
1367
if (shouldInlineQuadwordAtomics()) {
1368
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1369
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1370
setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);
1371
}
1372
1373
setBooleanContents(ZeroOrOneBooleanContent);
1374
1375
if (Subtarget.hasAltivec()) {
1376
// Altivec instructions set fields to all zeros or all ones.
1377
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
1378
}
1379
1380
if (shouldInlineQuadwordAtomics())
1381
setMaxAtomicSizeInBitsSupported(128);
1382
else if (isPPC64)
1383
setMaxAtomicSizeInBitsSupported(64);
1384
else
1385
setMaxAtomicSizeInBitsSupported(32);
1386
1387
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
1388
1389
// We have target-specific dag combine patterns for the following nodes:
1390
setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL,
1391
ISD::MUL, ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});
1392
if (Subtarget.hasFPCVT())
1393
setTargetDAGCombine(ISD::UINT_TO_FP);
1394
setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
1395
if (Subtarget.useCRBits())
1396
setTargetDAGCombine(ISD::BRCOND);
1397
setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,
1398
ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});
1399
1400
setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});
1401
1402
setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});
1403
1404
if (Subtarget.useCRBits()) {
1405
setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
1406
}
1407
1408
setLibcallName(RTLIB::LOG_F128, "logf128");
1409
setLibcallName(RTLIB::LOG2_F128, "log2f128");
1410
setLibcallName(RTLIB::LOG10_F128, "log10f128");
1411
setLibcallName(RTLIB::EXP_F128, "expf128");
1412
setLibcallName(RTLIB::EXP2_F128, "exp2f128");
1413
setLibcallName(RTLIB::SIN_F128, "sinf128");
1414
setLibcallName(RTLIB::COS_F128, "cosf128");
1415
setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
1416
setLibcallName(RTLIB::POW_F128, "powf128");
1417
setLibcallName(RTLIB::FMIN_F128, "fminf128");
1418
setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
1419
setLibcallName(RTLIB::REM_F128, "fmodf128");
1420
setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
1421
setLibcallName(RTLIB::CEIL_F128, "ceilf128");
1422
setLibcallName(RTLIB::FLOOR_F128, "floorf128");
1423
setLibcallName(RTLIB::TRUNC_F128, "truncf128");
1424
setLibcallName(RTLIB::ROUND_F128, "roundf128");
1425
setLibcallName(RTLIB::LROUND_F128, "lroundf128");
1426
setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
1427
setLibcallName(RTLIB::RINT_F128, "rintf128");
1428
setLibcallName(RTLIB::LRINT_F128, "lrintf128");
1429
setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
1430
setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
1431
setLibcallName(RTLIB::FMA_F128, "fmaf128");
1432
setLibcallName(RTLIB::FREXP_F128, "frexpf128");
1433
1434
if (Subtarget.isAIXABI()) {
1435
setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
1436
setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");
1437
setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");
1438
setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");
1439
}
1440
1441
// With 32 condition bits, we don't need to sink (and duplicate) compares
1442
// aggressively in CodeGenPrep.
1443
if (Subtarget.useCRBits()) {
1444
setHasMultipleConditionRegisters();
1445
setJumpIsExpensive();
1446
}
1447
1448
// TODO: The default entry number is set to 64. This stops most jump table
1449
// generation on PPC. But it is good for current PPC HWs because the indirect
1450
// branch instruction mtctr to the jump table may lead to bad branch prediction.
1451
// Re-evaluate this value on future HWs that can do better with mtctr.
1452
setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
1453
1454
setMinFunctionAlignment(Align(4));
1455
1456
switch (Subtarget.getCPUDirective()) {
1457
default: break;
1458
case PPC::DIR_970:
1459
case PPC::DIR_A2:
1460
case PPC::DIR_E500:
1461
case PPC::DIR_E500mc:
1462
case PPC::DIR_E5500:
1463
case PPC::DIR_PWR4:
1464
case PPC::DIR_PWR5:
1465
case PPC::DIR_PWR5X:
1466
case PPC::DIR_PWR6:
1467
case PPC::DIR_PWR6X:
1468
case PPC::DIR_PWR7:
1469
case PPC::DIR_PWR8:
1470
case PPC::DIR_PWR9:
1471
case PPC::DIR_PWR10:
1472
case PPC::DIR_PWR11:
1473
case PPC::DIR_PWR_FUTURE:
1474
setPrefLoopAlignment(Align(16));
1475
setPrefFunctionAlignment(Align(16));
1476
break;
1477
}
1478
1479
if (Subtarget.enableMachineScheduler())
1480
setSchedulingPreference(Sched::Source);
1481
else
1482
setSchedulingPreference(Sched::Hybrid);
1483
1484
computeRegisterProperties(STI.getRegisterInfo());
1485
1486
// The Freescale cores do better with aggressive inlining of memcpy and
1487
// friends. GCC uses same threshold of 128 bytes (= 32 word stores).
1488
if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
1489
Subtarget.getCPUDirective() == PPC::DIR_E5500) {
1490
MaxStoresPerMemset = 32;
1491
MaxStoresPerMemsetOptSize = 16;
1492
MaxStoresPerMemcpy = 32;
1493
MaxStoresPerMemcpyOptSize = 8;
1494
MaxStoresPerMemmove = 32;
1495
MaxStoresPerMemmoveOptSize = 8;
1496
} else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
1497
// The A2 also benefits from (very) aggressive inlining of memcpy and
1498
// friends. The overhead of the function call, even when warm, can be
1499
// over one hundred cycles.
1500
MaxStoresPerMemset = 128;
1501
MaxStoresPerMemcpy = 128;
1502
MaxStoresPerMemmove = 128;
1503
MaxLoadsPerMemcmp = 128;
1504
} else {
1505
MaxLoadsPerMemcmp = 8;
1506
MaxLoadsPerMemcmpOptSize = 4;
1507
}
1508
1509
IsStrictFPEnabled = true;
1510
1511
// Let the subtarget (CPU) decide if a predictable select is more expensive
1512
// than the corresponding branch. This information is used in CGP to decide
1513
// when to convert selects into branches.
1514
PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
1515
1516
GatherAllAliasesMaxDepth = PPCGatherAllAliasesMaxDepth;
1517
}
1518
1519
// *********************************** NOTE ************************************
1520
// For selecting load and store instructions, the addressing modes are defined
1521
// as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
1522
// patterns to match the load and store instructions.
1523
//
1524
// The TD definitions for the addressing modes correspond to their respective
1525
// Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
1526
// on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
1527
// address mode flags of a particular node. Afterwards, the computed address
1528
// flags are passed into getAddrModeForFlags() in order to retrieve the optimal
1529
// addressing mode. SelectOptimalAddrMode() then sets the Base and Displacement
1530
// accordingly, based on the preferred addressing mode.
1531
//
1532
// Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.
1533
// MemOpFlags contains all the possible flags that can be used to compute the
1534
// optimal addressing mode for load and store instructions.
1535
// AddrMode contains all the possible load and store addressing modes available
1536
// on Power (such as DForm, DSForm, DQForm, XForm, etc.)
1537
//
1538
// When adding new load and store instructions, it is possible that new address
1539
// flags may need to be added into MemOpFlags, and a new addressing mode will
1540
// need to be added to AddrMode. An entry of the new addressing mode (consisting
1541
// of the minimal and main distinguishing address flags for the new load/store
1542
// instructions) will need to be added into initializeAddrModeMap() below.
1543
// Finally, when adding new addressing modes, the getAddrModeForFlags() will
1544
// need to be updated to account for selecting the optimal addressing mode.
1545
// *****************************************************************************
1546
/// Initialize the map that relates the different addressing modes of the load
1547
/// and store instructions to a set of flags. This ensures the load/store
1548
/// instruction is correctly matched during instruction selection.
1549
void PPCTargetLowering::initializeAddrModeMap() {
1550
AddrModesMap[PPC::AM_DForm] = {
1551
// LWZ, STW
1552
PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,
1553
PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,
1554
PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1555
PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1556
// LBZ, LHZ, STB, STH
1557
PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1558
PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1559
PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1560
PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1561
// LHA
1562
PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1563
PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1564
PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1565
PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1566
// LFS, LFD, STFS, STFD
1567
PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1568
PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1569
PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1570
PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1571
};
1572
AddrModesMap[PPC::AM_DSForm] = {
1573
// LWA
1574
PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,
1575
PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1576
PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1577
// LD, STD
1578
PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,
1579
PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,
1580
PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,
1581
// DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf64
1582
PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1583
PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1584
PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1585
};
1586
AddrModesMap[PPC::AM_DQForm] = {
1587
// LXV, STXV
1588
PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1589
PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1590
PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1591
};
1592
AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |
1593
PPC::MOF_SubtargetP10};
1594
// TODO: Add mapping for quadword load/store.
1595
}
1596
1597
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1598
/// the desired ByVal argument alignment.
1599
static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
1600
if (MaxAlign == MaxMaxAlign)
1601
return;
1602
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1603
if (MaxMaxAlign >= 32 &&
1604
VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)
1605
MaxAlign = Align(32);
1606
else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&
1607
MaxAlign < 16)
1608
MaxAlign = Align(16);
1609
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1610
Align EltAlign;
1611
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
1612
if (EltAlign > MaxAlign)
1613
MaxAlign = EltAlign;
1614
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1615
for (auto *EltTy : STy->elements()) {
1616
Align EltAlign;
1617
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
1618
if (EltAlign > MaxAlign)
1619
MaxAlign = EltAlign;
1620
if (MaxAlign == MaxMaxAlign)
1621
break;
1622
}
1623
}
1624
}
1625
1626
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1627
/// function arguments in the caller parameter area.
1628
uint64_t PPCTargetLowering::getByValTypeAlignment(Type *Ty,
1629
const DataLayout &DL) const {
1630
// 16byte and wider vectors are passed on 16byte boundary.
1631
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
1632
Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
1633
if (Subtarget.hasAltivec())
1634
getMaxByValAlign(Ty, Alignment, Align(16));
1635
return Alignment.value();
1636
}
1637
1638
bool PPCTargetLowering::useSoftFloat() const {
1639
return Subtarget.useSoftFloat();
1640
}
1641
1642
bool PPCTargetLowering::hasSPE() const {
1643
return Subtarget.hasSPE();
1644
}
1645
1646
/// Answer true exactly when \p VT is a scalar integer type.
bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  const bool IsScalarInt = VT.isScalarInteger();
  return IsScalarInt;
}
1649
1650
bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
1651
Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
1652
if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
1653
return false;
1654
1655
if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {
1656
if (VTy->getScalarType()->isIntegerTy()) {
1657
// ElemSizeInBits 8/16 can fit in immediate field, not needed here.
1658
if (ElemSizeInBits == 32) {
1659
Index = Subtarget.isLittleEndian() ? 2 : 1;
1660
return true;
1661
}
1662
if (ElemSizeInBits == 64) {
1663
Index = Subtarget.isLittleEndian() ? 1 : 0;
1664
return true;
1665
}
1666
}
1667
}
1668
return false;
1669
}
1670
1671
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
1672
switch ((PPCISD::NodeType)Opcode) {
1673
case PPCISD::FIRST_NUMBER: break;
1674
case PPCISD::FSEL: return "PPCISD::FSEL";
1675
case PPCISD::XSMAXC: return "PPCISD::XSMAXC";
1676
case PPCISD::XSMINC: return "PPCISD::XSMINC";
1677
case PPCISD::FCFID: return "PPCISD::FCFID";
1678
case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
1679
case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
1680
case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
1681
case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
1682
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
1683
case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
1684
case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
1685
case PPCISD::FRE: return "PPCISD::FRE";
1686
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
1687
case PPCISD::FTSQRT:
1688
return "PPCISD::FTSQRT";
1689
case PPCISD::FSQRT:
1690
return "PPCISD::FSQRT";
1691
case PPCISD::STFIWX: return "PPCISD::STFIWX";
1692
case PPCISD::VPERM: return "PPCISD::VPERM";
1693
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
1694
case PPCISD::XXSPLTI_SP_TO_DP:
1695
return "PPCISD::XXSPLTI_SP_TO_DP";
1696
case PPCISD::XXSPLTI32DX:
1697
return "PPCISD::XXSPLTI32DX";
1698
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
1699
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
1700
case PPCISD::XXPERM:
1701
return "PPCISD::XXPERM";
1702
case PPCISD::VECSHL: return "PPCISD::VECSHL";
1703
case PPCISD::CMPB: return "PPCISD::CMPB";
1704
case PPCISD::Hi: return "PPCISD::Hi";
1705
case PPCISD::Lo: return "PPCISD::Lo";
1706
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
1707
case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
1708
case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
1709
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
1710
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
1711
case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";
1712
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
1713
case PPCISD::SRL: return "PPCISD::SRL";
1714
case PPCISD::SRA: return "PPCISD::SRA";
1715
case PPCISD::SHL: return "PPCISD::SHL";
1716
case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
1717
case PPCISD::CALL: return "PPCISD::CALL";
1718
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
1719
case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
1720
case PPCISD::CALL_RM:
1721
return "PPCISD::CALL_RM";
1722
case PPCISD::CALL_NOP_RM:
1723
return "PPCISD::CALL_NOP_RM";
1724
case PPCISD::CALL_NOTOC_RM:
1725
return "PPCISD::CALL_NOTOC_RM";
1726
case PPCISD::MTCTR: return "PPCISD::MTCTR";
1727
case PPCISD::BCTRL: return "PPCISD::BCTRL";
1728
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
1729
case PPCISD::BCTRL_RM:
1730
return "PPCISD::BCTRL_RM";
1731
case PPCISD::BCTRL_LOAD_TOC_RM:
1732
return "PPCISD::BCTRL_LOAD_TOC_RM";
1733
case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE";
1734
case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
1735
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
1736
case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
1737
case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
1738
case PPCISD::MFVSR: return "PPCISD::MFVSR";
1739
case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
1740
case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
1741
case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
1742
case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
1743
case PPCISD::SCALAR_TO_VECTOR_PERMUTED:
1744
return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";
1745
case PPCISD::ANDI_rec_1_EQ_BIT:
1746
return "PPCISD::ANDI_rec_1_EQ_BIT";
1747
case PPCISD::ANDI_rec_1_GT_BIT:
1748
return "PPCISD::ANDI_rec_1_GT_BIT";
1749
case PPCISD::VCMP: return "PPCISD::VCMP";
1750
case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec";
1751
case PPCISD::LBRX: return "PPCISD::LBRX";
1752
case PPCISD::STBRX: return "PPCISD::STBRX";
1753
case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
1754
case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
1755
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
1756
case PPCISD::STXSIX: return "PPCISD::STXSIX";
1757
case PPCISD::VEXTS: return "PPCISD::VEXTS";
1758
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
1759
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
1760
case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
1761
case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
1762
case PPCISD::ST_VSR_SCAL_INT:
1763
return "PPCISD::ST_VSR_SCAL_INT";
1764
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
1765
case PPCISD::BDNZ: return "PPCISD::BDNZ";
1766
case PPCISD::BDZ: return "PPCISD::BDZ";
1767
case PPCISD::MFFS: return "PPCISD::MFFS";
1768
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
1769
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
1770
case PPCISD::CR6SET: return "PPCISD::CR6SET";
1771
case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
1772
case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
1773
case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
1774
case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
1775
case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
1776
case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
1777
case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
1778
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
1779
case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
1780
case PPCISD::GET_TLS_MOD_AIX: return "PPCISD::GET_TLS_MOD_AIX";
1781
case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER";
1782
case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
1783
case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX";
1784
case PPCISD::TLSLD_AIX: return "PPCISD::TLSLD_AIX";
1785
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
1786
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
1787
case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
1788
case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
1789
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
1790
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
1791
case PPCISD::PADDI_DTPREL:
1792
return "PPCISD::PADDI_DTPREL";
1793
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
1794
case PPCISD::SC: return "PPCISD::SC";
1795
case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
1796
case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
1797
case PPCISD::RFEBB: return "PPCISD::RFEBB";
1798
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
1799
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
1800
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
1801
case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
1802
case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
1803
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
1804
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
1805
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
1806
case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
1807
case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR:
1808
return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR";
1809
case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR:
1810
return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR";
1811
case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";
1812
case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD";
1813
case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
1814
case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
1815
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
1816
case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";
1817
case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";
1818
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
1819
case PPCISD::STRICT_FADDRTZ:
1820
return "PPCISD::STRICT_FADDRTZ";
1821
case PPCISD::STRICT_FCTIDZ:
1822
return "PPCISD::STRICT_FCTIDZ";
1823
case PPCISD::STRICT_FCTIWZ:
1824
return "PPCISD::STRICT_FCTIWZ";
1825
case PPCISD::STRICT_FCTIDUZ:
1826
return "PPCISD::STRICT_FCTIDUZ";
1827
case PPCISD::STRICT_FCTIWUZ:
1828
return "PPCISD::STRICT_FCTIWUZ";
1829
case PPCISD::STRICT_FCFID:
1830
return "PPCISD::STRICT_FCFID";
1831
case PPCISD::STRICT_FCFIDU:
1832
return "PPCISD::STRICT_FCFIDU";
1833
case PPCISD::STRICT_FCFIDS:
1834
return "PPCISD::STRICT_FCFIDS";
1835
case PPCISD::STRICT_FCFIDUS:
1836
return "PPCISD::STRICT_FCFIDUS";
1837
case PPCISD::LXVRZX: return "PPCISD::LXVRZX";
1838
case PPCISD::STORE_COND:
1839
return "PPCISD::STORE_COND";
1840
}
1841
return nullptr;
1842
}
1843
1844
/// Pick the result type of a SETCC on operands of type \p VT.
///
/// Vector comparisons produce \p VT with its elements changed to integers.
/// Scalar comparisons produce i1 when the subtarget uses CR bits and i32
/// otherwise. \p DL and \p C are not consulted.
EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
                                          EVT VT) const {
  if (VT.isVector())
    return VT.changeVectorElementTypeToInteger();

  if (Subtarget.useCRBits())
    return MVT::i1;
  return MVT::i32;
}
1851
1852
/// Always answers true. \p VT must be a floating-point type, which is only
/// checked by an assertion.
bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // Callers are expected to ask only about floating-point types.
  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");

  return true;
}
1856
1857
//===----------------------------------------------------------------------===//
1858
// Node matching predicates, for use by the tblgen matching code.
1859
//===----------------------------------------------------------------------===//
1860
1861
/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
1862
static bool isFloatingPointZero(SDValue Op) {
1863
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
1864
return CFP->getValueAPF().isZero();
1865
else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
1866
// Maybe this has already been legalized into the constant pool?
1867
if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
1868
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
1869
return CFP->getValueAPF().isZero();
1870
}
1871
return false;
1872
}
1873
1874
/// isConstantOrUndef - Op is a shuffle-mask element: negative means undef,
/// anything else is a concrete index. Return true if Op is undef or if it
/// matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
  if (Op < 0)
    return true; // Undef matches anything.
  return Op == Val;
}
1879
1880
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
1881
/// VPKUHUM instruction.
1882
/// The ShuffleKind distinguishes between big-endian operations with
1883
/// two different inputs (0), either-endian operations with two identical
1884
/// inputs (1), and little-endian operations with two different inputs (2).
1885
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1886
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1887
SelectionDAG &DAG) {
1888
bool IsLE = DAG.getDataLayout().isLittleEndian();
1889
if (ShuffleKind == 0) {
1890
if (IsLE)
1891
return false;
1892
for (unsigned i = 0; i != 16; ++i)
1893
if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
1894
return false;
1895
} else if (ShuffleKind == 2) {
1896
if (!IsLE)
1897
return false;
1898
for (unsigned i = 0; i != 16; ++i)
1899
if (!isConstantOrUndef(N->getMaskElt(i), i*2))
1900
return false;
1901
} else if (ShuffleKind == 1) {
1902
unsigned j = IsLE ? 0 : 1;
1903
for (unsigned i = 0; i != 8; ++i)
1904
if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
1905
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
1906
return false;
1907
}
1908
return true;
1909
}
1910
1911
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
1912
/// VPKUWUM instruction.
1913
/// The ShuffleKind distinguishes between big-endian operations with
1914
/// two different inputs (0), either-endian operations with two identical
1915
/// inputs (1), and little-endian operations with two different inputs (2).
1916
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1917
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1918
SelectionDAG &DAG) {
1919
bool IsLE = DAG.getDataLayout().isLittleEndian();
1920
if (ShuffleKind == 0) {
1921
if (IsLE)
1922
return false;
1923
for (unsigned i = 0; i != 16; i += 2)
1924
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
1925
!isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
1926
return false;
1927
} else if (ShuffleKind == 2) {
1928
if (!IsLE)
1929
return false;
1930
for (unsigned i = 0; i != 16; i += 2)
1931
if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
1932
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
1933
return false;
1934
} else if (ShuffleKind == 1) {
1935
unsigned j = IsLE ? 0 : 2;
1936
for (unsigned i = 0; i != 8; i += 2)
1937
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
1938
!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
1939
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
1940
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
1941
return false;
1942
}
1943
return true;
1944
}
1945
1946
/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
1947
/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
1948
/// current subtarget.
1949
///
1950
/// The ShuffleKind distinguishes between big-endian operations with
1951
/// two different inputs (0), either-endian operations with two identical
1952
/// inputs (1), and little-endian operations with two different inputs (2).
1953
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1954
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1955
SelectionDAG &DAG) {
1956
const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
1957
if (!Subtarget.hasP8Vector())
1958
return false;
1959
1960
bool IsLE = DAG.getDataLayout().isLittleEndian();
1961
if (ShuffleKind == 0) {
1962
if (IsLE)
1963
return false;
1964
for (unsigned i = 0; i != 16; i += 4)
1965
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
1966
!isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
1967
!isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
1968
!isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
1969
return false;
1970
} else if (ShuffleKind == 2) {
1971
if (!IsLE)
1972
return false;
1973
for (unsigned i = 0; i != 16; i += 4)
1974
if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
1975
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
1976
!isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
1977
!isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
1978
return false;
1979
} else if (ShuffleKind == 1) {
1980
unsigned j = IsLE ? 0 : 4;
1981
for (unsigned i = 0; i != 8; i += 4)
1982
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
1983
!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
1984
!isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
1985
!isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
1986
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
1987
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
1988
!isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
1989
!isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
1990
return false;
1991
}
1992
return true;
1993
}
1994
1995
/// isVMerge - Common function, used to match vmrg* shuffles.
1996
///
1997
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
1998
unsigned LHSStart, unsigned RHSStart) {
1999
if (N->getValueType(0) != MVT::v16i8)
2000
return false;
2001
assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
2002
"Unsupported merge size!");
2003
2004
for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
2005
for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
2006
if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
2007
LHSStart+j+i*UnitSize) ||
2008
!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
2009
RHSStart+j+i*UnitSize))
2010
return false;
2011
}
2012
return true;
2013
}
2014
2015
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
2016
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
2017
/// The ShuffleKind distinguishes between big-endian merges with two
2018
/// different inputs (0), either-endian merges with two identical inputs (1),
2019
/// and little-endian merges with two different inputs (2). For the latter,
2020
/// the input operands are swapped (see PPCInstrAltivec.td).
2021
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
2022
unsigned ShuffleKind, SelectionDAG &DAG) {
2023
if (DAG.getDataLayout().isLittleEndian()) {
2024
if (ShuffleKind == 1) // unary
2025
return isVMerge(N, UnitSize, 0, 0);
2026
else if (ShuffleKind == 2) // swapped
2027
return isVMerge(N, UnitSize, 0, 16);
2028
else
2029
return false;
2030
} else {
2031
if (ShuffleKind == 1) // unary
2032
return isVMerge(N, UnitSize, 8, 8);
2033
else if (ShuffleKind == 0) // normal
2034
return isVMerge(N, UnitSize, 8, 24);
2035
else
2036
return false;
2037
}
2038
}
2039
2040
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
2041
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
2042
/// The ShuffleKind distinguishes between big-endian merges with two
2043
/// different inputs (0), either-endian merges with two identical inputs (1),
2044
/// and little-endian merges with two different inputs (2). For the latter,
2045
/// the input operands are swapped (see PPCInstrAltivec.td).
2046
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
2047
unsigned ShuffleKind, SelectionDAG &DAG) {
2048
if (DAG.getDataLayout().isLittleEndian()) {
2049
if (ShuffleKind == 1) // unary
2050
return isVMerge(N, UnitSize, 8, 8);
2051
else if (ShuffleKind == 2) // swapped
2052
return isVMerge(N, UnitSize, 8, 24);
2053
else
2054
return false;
2055
} else {
2056
if (ShuffleKind == 1) // unary
2057
return isVMerge(N, UnitSize, 0, 0);
2058
else if (ShuffleKind == 0) // normal
2059
return isVMerge(N, UnitSize, 0, 16);
2060
else
2061
return false;
2062
}
2063
}
2064
2065
/**
2066
* Common function used to match vmrgew and vmrgow shuffles
2067
*
2068
* The indexOffset determines whether to look for even or odd words in
2069
* the shuffle mask. This is based on the of the endianness of the target
2070
* machine.
2071
* - Little Endian:
2072
* - Use offset of 0 to check for odd elements
2073
* - Use offset of 4 to check for even elements
2074
* - Big Endian:
2075
* - Use offset of 0 to check for even elements
2076
* - Use offset of 4 to check for odd elements
2077
* A detailed description of the vector element ordering for little endian and
2078
* big endian can be found at
2079
* http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
2080
* Targeting your applications - what little endian and big endian IBM XL C/C++
2081
* compiler differences mean to you
2082
*
2083
* The mask to the shuffle vector instruction specifies the indices of the
2084
* elements from the two input vectors to place in the result. The elements are
2085
* numbered in array-access order, starting with the first vector. These vectors
2086
* are always of type v16i8, thus each vector will contain 16 elements of size
2087
* 8. More info on the shuffle vector can be found in the
2088
* http://llvm.org/docs/LangRef.html#shufflevector-instruction
2089
* Language Reference.
2090
*
2091
* The RHSStartValue indicates whether the same input vectors are used (unary)
2092
* or two different input vectors are used, based on the following:
2093
* - If the instruction uses the same vector for both inputs, the range of the
2094
* indices will be 0 to 15. In this case, the RHSStart value passed should
2095
* be 0.
2096
* - If the instruction has two different vectors then the range of the
2097
* indices will be 0 to 31. In this case, the RHSStart value passed should
2098
* be 16 (indices 0-15 specify elements in the first vector while indices 16
2099
* to 31 specify elements in the second vector).
2100
*
2101
* \param[in] N The shuffle vector SD Node to analyze
2102
* \param[in] IndexOffset Specifies whether to look for even or odd elements
2103
* \param[in] RHSStartValue Specifies the starting index for the righthand input
2104
* vector to the shuffle_vector instruction
2105
* \return true iff this shuffle vector represents an even or odd word merge
2106
*/
2107
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
                     unsigned RHSStartValue) {
  // Only byte-vector shuffles are recognised here.
  if (N->getValueType(0) != MVT::v16i8)
    return false;

  // Each 8-byte half of the result is made of two 4-byte words: the first is
  // expected to come from the left input, the second from the input that
  // starts at RHSStartValue. The second half repeats the pattern 8 bytes on.
  for (unsigned Word = 0; Word != 2; ++Word) {
    const unsigned SrcBase = Word * RHSStartValue + IndexOffset;
    for (unsigned Byte = 0; Byte != 4; ++Byte) {
      const unsigned MaskPos = Word * 4 + Byte;
      const unsigned Expected = SrcBase + Byte;
      if (!isConstantOrUndef(N->getMaskElt(MaskPos), Expected))
        return false;
      if (!isConstantOrUndef(N->getMaskElt(MaskPos + 8), Expected + 8))
        return false;
    }
  }
  return true;
}
2121
2122
/**
2123
* Determine if the specified shuffle mask is suitable for the vmrgew or
2124
* vmrgow instructions.
2125
*
2126
* \param[in] N The shuffle vector SD Node to analyze
2127
* \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
2128
* \param[in] ShuffleKind Identify the type of merge:
2129
* - 0 = big-endian merge with two different inputs;
2130
* - 1 = either-endian merge with two identical inputs;
2131
* - 2 = little-endian merge with two different inputs (inputs are swapped for
2132
* little-endian merges).
2133
* \param[in] DAG The current SelectionDAG
2134
* \return true iff this shuffle mask
2135
*/
2136
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
2137
unsigned ShuffleKind, SelectionDAG &DAG) {
2138
if (DAG.getDataLayout().isLittleEndian()) {
2139
unsigned indexOffset = CheckEven ? 4 : 0;
2140
if (ShuffleKind == 1) // Unary
2141
return isVMerge(N, indexOffset, 0);
2142
else if (ShuffleKind == 2) // swapped
2143
return isVMerge(N, indexOffset, 16);
2144
else
2145
return false;
2146
}
2147
else {
2148
unsigned indexOffset = CheckEven ? 0 : 4;
2149
if (ShuffleKind == 1) // Unary
2150
return isVMerge(N, indexOffset, 0);
2151
else if (ShuffleKind == 0) // Normal
2152
return isVMerge(N, indexOffset, 16);
2153
else
2154
return false;
2155
}
2156
return false;
2157
}
2158
2159
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
2160
/// amount, otherwise return -1.
2161
/// The ShuffleKind distinguishes between big-endian operations with two
2162
/// different inputs (0), either-endian operations with two identical inputs
2163
/// (1), and little-endian operations with two different inputs (2). For the
2164
/// latter, the input operands are swapped (see PPCInstrAltivec.td).
2165
int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
2166
SelectionDAG &DAG) {
2167
if (N->getValueType(0) != MVT::v16i8)
2168
return -1;
2169
2170
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2171
2172
// Find the first non-undef value in the shuffle mask.
2173
unsigned i;
2174
for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
2175
/*search*/;
2176
2177
if (i == 16) return -1; // all undef.
2178
2179
// Otherwise, check to see if the rest of the elements are consecutively
2180
// numbered from this value.
2181
unsigned ShiftAmt = SVOp->getMaskElt(i);
2182
if (ShiftAmt < i) return -1;
2183
2184
ShiftAmt -= i;
2185
bool isLE = DAG.getDataLayout().isLittleEndian();
2186
2187
if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
2188
// Check the rest of the elements to see if they are consecutive.
2189
for (++i; i != 16; ++i)
2190
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
2191
return -1;
2192
} else if (ShuffleKind == 1) {
2193
// Check the rest of the elements to see if they are consecutive.
2194
for (++i; i != 16; ++i)
2195
if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
2196
return -1;
2197
} else
2198
return -1;
2199
2200
if (isLE)
2201
ShiftAmt = 16 - ShiftAmt;
2202
2203
return ShiftAmt;
2204
}
2205
2206
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
2207
/// specifies a splat of a single element that is suitable for input to
2208
/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
2209
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
2210
EVT VT = N->getValueType(0);
2211
if (VT == MVT::v2i64 || VT == MVT::v2f64)
2212
return EltSize == 8 && N->getMaskElt(0) == N->getMaskElt(1);
2213
2214
assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) &&
2215
EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
2216
2217
// The consecutive indices need to specify an element, not part of two
2218
// different elements. So abandon ship early if this isn't the case.
2219
if (N->getMaskElt(0) % EltSize != 0)
2220
return false;
2221
2222
// This is a splat operation if each element of the permute is the same, and
2223
// if the value doesn't reference the second vector.
2224
unsigned ElementBase = N->getMaskElt(0);
2225
2226
// FIXME: Handle UNDEF elements too!
2227
if (ElementBase >= 16)
2228
return false;
2229
2230
// Check that the indices are consecutive, in the case of a multi-byte element
2231
// splatted with a v16i8 mask.
2232
for (unsigned i = 1; i != EltSize; ++i)
2233
if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
2234
return false;
2235
2236
for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
2237
if (N->getMaskElt(i) < 0) continue;
2238
for (unsigned j = 0; j != EltSize; ++j)
2239
if (N->getMaskElt(i+j) != N->getMaskElt(j))
2240
return false;
2241
}
2242
return true;
2243
}
2244
2245
/// Check that the mask is shuffling N byte elements. Within each N byte
2246
/// element of the mask, the indices could be either in increasing or
2247
/// decreasing order as long as they are consecutive.
2248
/// \param[in] N the shuffle vector SD Node to analyze
2249
/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
2250
/// Word/DoubleWord/QuadWord).
2251
/// \param[in] StepLen the delta indices number among the N byte element, if
2252
/// the mask is in increasing/decreasing order then it is 1/-1.
2253
/// \return true iff the mask is shuffling N byte elements.
2254
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
2255
int StepLen) {
2256
assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
2257
"Unexpected element width.");
2258
assert((StepLen == 1 || StepLen == -1) && "Unexpected element width.");
2259
2260
unsigned NumOfElem = 16 / Width;
2261
unsigned MaskVal[16]; // Width is never greater than 16
2262
for (unsigned i = 0; i < NumOfElem; ++i) {
2263
MaskVal[0] = N->getMaskElt(i * Width);
2264
if ((StepLen == 1) && (MaskVal[0] % Width)) {
2265
return false;
2266
} else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
2267
return false;
2268
}
2269
2270
for (unsigned int j = 1; j < Width; ++j) {
2271
MaskVal[j] = N->getMaskElt(i * Width + j);
2272
if (MaskVal[j] != MaskVal[j-1] + StepLen) {
2273
return false;
2274
}
2275
}
2276
}
2277
2278
return true;
2279
}
2280
2281
/// Check whether shuffle \p N keeps three words of one input in place and
/// replaces the remaining word with a word taken from elsewhere.
/// On success \p ShiftElts, \p InsertAtByte and \p Swap are filled in from the
/// matched pattern and \p IsLE. Note that when the second operand is undef,
/// \p ShiftElts and \p Swap are written even if no pattern matches.
/// \return true iff one of the recognised insert patterns matches.
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
  if (!isNByteElemShuffleMask(N, 4, 1))
    return false;

  // Word index selected by each result word (mask elements 0, 4, 8, 12).
  unsigned Words[4];
  for (unsigned W = 0; W != 4; ++W)
    Words[W] = N->getMaskElt(W * 4) / 4;

  const unsigned LittleEndianShifts[] = {2, 1, 0, 3};
  const unsigned BigEndianShifts[] = {3, 0, 1, 2};

  // True when every result word except Skip selects word (Base + position),
  // i.e. is left in place relative to the input starting at word Base.
  auto OthersInPlace = [&Words](unsigned Skip, unsigned Base) {
    for (unsigned W = 0; W != 4; ++W)
      if (W != Skip && Words[W] != Base + W)
        return false;
    return true;
  };

  // Byte position of the inserted word as counted for the given endianness.
  auto InsertByteFor = [IsLE](unsigned Pos) {
    return IsLE ? 12 - 4 * Pos : 4 * Pos;
  };

  // Below, H is any word index in [4,7] and L any word index in [0,3].
  // Position 0: H,1,2,3 or L,5,6,7     Position 1: 0,H,2,3 or 4,L,6,7
  // Position 2: 0,1,H,3 or 4,5,L,7     Position 3: 0,1,2,H or 4,5,6,L
  for (unsigned Pos = 0; Pos != 4; ++Pos) {
    const unsigned Src = Words[Pos];
    const bool InsertFromSecond = Src > 3 && OthersInPlace(Pos, 0);
    const bool InsertFromFirst = Src < 4 && OthersInPlace(Pos, 4);
    if (!InsertFromSecond && !InsertFromFirst)
      continue;

    ShiftElts =
        IsLE ? LittleEndianShifts[Src & 0x3] : BigEndianShifts[Src & 0x3];
    InsertAtByte = InsertByteFor(Pos);
    Swap = Src < 4;
    return true;
  }

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    ShiftElts = 0;
    Swap = true;
    const unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
    for (unsigned Pos = 0; Pos != 4; ++Pos) {
      if (Words[Pos] == XXINSERTWSrcElem && OthersInPlace(Pos, 0)) {
        InsertAtByte = InsertByteFor(Pos);
        return true;
      }
    }
  }

  return false;
}
2355
2356
/// Check whether shuffle \p N selects four consecutive words (with
/// wrap-around) from its input(s). On success \p ShiftElts receives the word
/// shift amount and \p Swap whether the inputs have to be exchanged.
/// \return true iff the mask has that consecutive-word form.
bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                               bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
  // Ensure each byte index of the word is consecutive.
  if (!isNByteElemShuffleMask(N, 4, 1))
    return false;

  // Mask elements 0, 4, 8 and 12 are the first bytes of the result words.
  unsigned Words[4];
  for (unsigned W = 0; W != 4; ++W)
    Words[W] = N->getMaskElt(W * 4) / 4;
  const unsigned First = Words[0];

  // True when each word index is its predecessor plus one, modulo Modulus.
  auto IsConsecutive = [&Words](unsigned Modulus) {
    for (unsigned W = 1; W != 4; ++W)
      if (Words[W] != (Words[W - 1] + 1) % Modulus)
        return false;
    return true;
  };

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    assert(First < 4 && "Indexing into an undef vector?");
    if (!IsConsecutive(4))
      return false;

    ShiftElts = IsLE ? (4 - First) % 4 : First;
    Swap = false;
    return true;
  }

  // Two inputs: the word indices run over the eight words of both vectors.
  if (!IsConsecutive(8))
    return false;

  if (IsLE) {
    switch (First) {
    case 0:
    case 5:
    case 6:
    case 7:
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 3 left elements of the second vector
      // (or if there is no shift to be done at all).
      Swap = false;
      ShiftElts = (8 - First) % 8;
      break;
    case 1:
    case 2:
    case 3:
    case 4:
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 3 left elements of the first vector
      // (or if we're shifting by 4 - thereby simply swapping the vectors).
      Swap = true;
      ShiftElts = (4 - First) % 4;
      break;
    default:
      break;
    }
    return true;
  }

  // Big endian.
  switch (First) {
  case 0:
  case 1:
  case 2:
  case 3:
    // Input vectors don't need to be swapped if the leading element
    // of the result is one of the 4 elements of the first vector.
    Swap = false;
    ShiftElts = First;
    break;
  case 4:
  case 5:
  case 6:
  case 7:
    // Input vectors need to be swapped if the leading element
    // of the result is one of the 4 elements of the right vector.
    Swap = true;
    ShiftElts = First - 4;
    break;
  default:
    break;
  }
  return true;
}
2417
2418
/// Return true if \p N reverses the byte order inside every \p Width byte
/// element of a v16i8 vector while leaving the elements themselves in place.
static bool isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

  // Bytes inside each element must be consecutive in decreasing order.
  if (!isNByteElemShuffleMask(N, Width, -1))
    return false;

  // Each element must start with its own last byte, i.e. the reversal happens
  // in place rather than pulling bytes from another element.
  int ElemStart = 0;
  while (ElemStart < 16) {
    const int LastByteOfElem = ElemStart + Width - 1;
    if (N->getMaskElt(ElemStart) != LastByteOfElem)
      return false;
    ElemStart += Width;
  }

  return true;
}
2430
2431
/// Return true if \p N reverses the bytes within each halfword (2-byte
/// element) of a v16i8 vector.
bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
  constexpr int HalfWordBytes = 2;
  return isXXBRShuffleMaskHelper(N, HalfWordBytes);
}
2434
2435
/// Return true if \p N reverses the bytes within each word (4-byte element)
/// of a v16i8 vector.
bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
  constexpr int WordBytes = 4;
  return isXXBRShuffleMaskHelper(N, WordBytes);
}
2438
2439
/// Return true if \p N reverses the bytes within each doubleword (8-byte
/// element) of a v16i8 vector.
bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
  constexpr int DoubleWordBytes = 8;
  return isXXBRShuffleMaskHelper(N, DoubleWordBytes);
}
2442
2443
/// Return true if \p N reverses all sixteen bytes of a v16i8 vector, i.e.
/// treats the vector as a single quadword (16-byte element).
bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
  constexpr int QuadWordBytes = 16;
  return isXXBRShuffleMaskHelper(N, QuadWordBytes);
}
2446
2447
/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
2448
/// if the inputs to the instruction should be swapped and set \p DM to the
2449
/// value for the immediate.
2450
/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
2451
/// AND element 0 of the result comes from the first input (LE) or second input
2452
/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
2453
/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
2454
/// mask.
2455
bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
2456
bool &Swap, bool IsLE) {
2457
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2458
2459
// Ensure each byte index of the double word is consecutive.
2460
if (!isNByteElemShuffleMask(N, 8, 1))
2461
return false;
2462
2463
unsigned M0 = N->getMaskElt(0) / 8;
2464
unsigned M1 = N->getMaskElt(8) / 8;
2465
assert(((M0 | M1) < 4) && "A mask element out of bounds?");
2466
2467
// If both vector operands for the shuffle are the same vector, the mask will
2468
// contain only elements from the first one and the second one will be undef.
2469
if (N->getOperand(1).isUndef()) {
2470
if ((M0 | M1) < 2) {
2471
DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
2472
Swap = false;
2473
return true;
2474
} else
2475
return false;
2476
}
2477
2478
if (IsLE) {
2479
if (M0 > 1 && M1 < 2) {
2480
Swap = false;
2481
} else if (M0 < 2 && M1 > 1) {
2482
M0 = (M0 + 2) % 4;
2483
M1 = (M1 + 2) % 4;
2484
Swap = true;
2485
} else
2486
return false;
2487
2488
// Note: if control flow comes here that means Swap is already set above
2489
DM = (((~M1) & 1) << 1) + ((~M0) & 1);
2490
return true;
2491
} else { // BE
2492
if (M0 < 2 && M1 > 1) {
2493
Swap = false;
2494
} else if (M0 > 1 && M1 < 2) {
2495
M0 = (M0 + 2) % 4;
2496
M1 = (M1 + 2) % 4;
2497
Swap = true;
2498
} else
2499
return false;
2500
2501
// Note: if control flow comes here that means Swap is already set above
2502
DM = (M0 << 1) + (M1 & 1);
2503
return true;
2504
}
2505
}
2506
2507
2508
/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
2509
/// appropriate for PPC mnemonics (which have a big endian bias - namely
2510
/// elements are counted from the left of the vector register).
2511
unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
2512
SelectionDAG &DAG) {
2513
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2514
assert(isSplatShuffleMask(SVOp, EltSize));
2515
EVT VT = SVOp->getValueType(0);
2516
2517
if (VT == MVT::v2i64 || VT == MVT::v2f64)
2518
return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(0)
2519
: SVOp->getMaskElt(0);
2520
2521
if (DAG.getDataLayout().isLittleEndian())
2522
return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
2523
else
2524
return SVOp->getMaskElt(0) / EltSize;
2525
}
2526
2527
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
2528
/// by using a vspltis[bhw] instruction of the specified element size, return
2529
/// the constant being splatted. The ByteSize field indicates the number of
2530
/// bytes of each element [124] -> [bhw].
2531
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
2532
SDValue OpVal;
2533
2534
// If ByteSize of the splat is bigger than the element size of the
2535
// build_vector, then we have a case where we are checking for a splat where
2536
// multiple elements of the buildvector are folded together into a single
2537
// logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
2538
unsigned EltSize = 16/N->getNumOperands();
2539
if (EltSize < ByteSize) {
2540
unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
2541
SDValue UniquedVals[4];
2542
assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
2543
2544
// See if all of the elements in the buildvector agree across.
2545
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2546
if (N->getOperand(i).isUndef()) continue;
2547
// If the element isn't a constant, bail fully out.
2548
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
2549
2550
if (!UniquedVals[i&(Multiple-1)].getNode())
2551
UniquedVals[i&(Multiple-1)] = N->getOperand(i);
2552
else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
2553
return SDValue(); // no match.
2554
}
2555
2556
// Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
2557
// either constant or undef values that are identical for each chunk. See
2558
// if these chunks can form into a larger vspltis*.
2559
2560
// Check to see if all of the leading entries are either 0 or -1. If
2561
// neither, then this won't fit into the immediate field.
2562
bool LeadingZero = true;
2563
bool LeadingOnes = true;
2564
for (unsigned i = 0; i != Multiple-1; ++i) {
2565
if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
2566
2567
LeadingZero &= isNullConstant(UniquedVals[i]);
2568
LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
2569
}
2570
// Finally, check the least significant entry.
2571
if (LeadingZero) {
2572
if (!UniquedVals[Multiple-1].getNode())
2573
return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
2574
int Val = UniquedVals[Multiple - 1]->getAsZExtVal();
2575
if (Val < 16) // 0,0,0,4 -> vspltisw(4)
2576
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
2577
}
2578
if (LeadingOnes) {
2579
if (!UniquedVals[Multiple-1].getNode())
2580
return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
2581
int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
2582
if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
2583
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
2584
}
2585
2586
return SDValue();
2587
}
2588
2589
// Check to see if this buildvec has a single non-undef value in its elements.
2590
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2591
if (N->getOperand(i).isUndef()) continue;
2592
if (!OpVal.getNode())
2593
OpVal = N->getOperand(i);
2594
else if (OpVal != N->getOperand(i))
2595
return SDValue();
2596
}
2597
2598
if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
2599
2600
unsigned ValSizeInBytes = EltSize;
2601
uint64_t Value = 0;
2602
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
2603
Value = CN->getZExtValue();
2604
} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
2605
assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
2606
Value = llvm::bit_cast<uint32_t>(CN->getValueAPF().convertToFloat());
2607
}
2608
2609
// If the splat value is larger than the element value, then we can never do
2610
// this splat. The only case that we could fit the replicated bits into our
2611
// immediate field for would be zero, and we prefer to use vxor for it.
2612
if (ValSizeInBytes < ByteSize) return SDValue();
2613
2614
// If the element value is larger than the splat value, check if it consists
2615
// of a repeated bit pattern of size ByteSize.
2616
if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
2617
return SDValue();
2618
2619
// Properly sign extend the value.
2620
int MaskVal = SignExtend32(Value, ByteSize * 8);
2621
2622
// If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
2623
if (MaskVal == 0) return SDValue();
2624
2625
// Finally, if this value fits in a 5 bit sext field, return it
2626
if (SignExtend32<5>(MaskVal) == MaskVal)
2627
return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
2628
return SDValue();
2629
}
2630
2631
//===----------------------------------------------------------------------===//
2632
// Addressing Mode Selection
2633
//===----------------------------------------------------------------------===//
2634
2635
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
2636
/// or 64-bit immediate, and if the value can be accurately represented as a
2637
/// sign extension from a 16-bit value. If so, this returns true and the
2638
/// immediate.
2639
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
2640
if (!isa<ConstantSDNode>(N))
2641
return false;
2642
2643
Imm = (int16_t)N->getAsZExtVal();
2644
if (N->getValueType(0) == MVT::i32)
2645
return Imm == (int32_t)N->getAsZExtVal();
2646
else
2647
return Imm == (int64_t)N->getAsZExtVal();
2648
}
2649
/// SDValue convenience overload: performs the signed 16-bit immediate test on
/// the node that \p Op refers to.
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
  SDNode *Node = Op.getNode();
  return isIntS16Immediate(Node, Imm);
}
2652
2653
/// Used when computing address flags for selecting loads and stores.
2654
/// If we have an OR, check if the LHS and RHS are provably disjoint.
2655
/// An OR of two provably disjoint values is equivalent to an ADD.
2656
/// Most PPC load/store instructions compute the effective address as a sum,
2657
/// so doing this conversion is useful.
2658
static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
2659
if (N.getOpcode() != ISD::OR)
2660
return false;
2661
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2662
if (!LHSKnown.Zero.getBoolValue())
2663
return false;
2664
KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
2665
return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);
2666
}
2667
2668
/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
2669
/// be represented as an indexed [r+r] operation.
2670
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
2671
SDValue &Index,
2672
SelectionDAG &DAG) const {
2673
for (SDNode *U : N->uses()) {
2674
if (MemSDNode *Memop = dyn_cast<MemSDNode>(U)) {
2675
if (Memop->getMemoryVT() == MVT::f64) {
2676
Base = N.getOperand(0);
2677
Index = N.getOperand(1);
2678
return true;
2679
}
2680
}
2681
}
2682
return false;
2683
}
2684
2685
/// isIntS34Immediate - This method tests if value of node given can be
2686
/// accurately represented as a sign extension from a 34-bit value. If so,
2687
/// this returns true and the immediate.
2688
bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
2689
if (!isa<ConstantSDNode>(N))
2690
return false;
2691
2692
Imm = (int64_t)N->getAsZExtVal();
2693
return isInt<34>(Imm);
2694
}
2695
/// SDValue convenience overload: performs the signed 34-bit immediate test on
/// the node that \p Op refers to.
bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
  SDNode *Node = Op.getNode();
  return isIntS34Immediate(Node, Imm);
}
2698
2699
/// SelectAddressRegReg - Given the specified addressed, check to see if it
2700
/// can be represented as an indexed [r+r] operation. Returns false if it
2701
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
2702
/// non-zero and N can be represented by a base register plus a signed 16-bit
2703
/// displacement, make a more precise judgement by checking (displacement % \p
2704
/// EncodingAlignment).
2705
bool PPCTargetLowering::SelectAddressRegReg(
2706
SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
2707
MaybeAlign EncodingAlignment) const {
2708
// If we have a PC Relative target flag don't select as [reg+reg]. It will be
2709
// a [pc+imm].
2710
if (SelectAddressPCRel(N, Base))
2711
return false;
2712
2713
int16_t Imm = 0;
2714
if (N.getOpcode() == ISD::ADD) {
2715
// Is there any SPE load/store (f64), which can't handle 16bit offset?
2716
// SPE load/store can only handle 8-bit offsets.
2717
if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
2718
return true;
2719
if (isIntS16Immediate(N.getOperand(1), Imm) &&
2720
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
2721
return false; // r+i
2722
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
2723
return false; // r+i
2724
2725
Base = N.getOperand(0);
2726
Index = N.getOperand(1);
2727
return true;
2728
} else if (N.getOpcode() == ISD::OR) {
2729
if (isIntS16Immediate(N.getOperand(1), Imm) &&
2730
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
2731
return false; // r+i can fold it if we can.
2732
2733
// If this is an or of disjoint bitfields, we can codegen this as an add
2734
// (for better address arithmetic) if the LHS and RHS of the OR are provably
2735
// disjoint.
2736
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2737
2738
if (LHSKnown.Zero.getBoolValue()) {
2739
KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
2740
// If all of the bits are known zero on the LHS or RHS, the add won't
2741
// carry.
2742
if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
2743
Base = N.getOperand(0);
2744
Index = N.getOperand(1);
2745
return true;
2746
}
2747
}
2748
}
2749
2750
return false;
2751
}
2752
2753
// If we happen to be doing an i64 load or store into a stack slot that has
2754
// less than a 4-byte alignment, then the frame-index elimination may need to
2755
// use an indexed load or store instruction (because the offset may not be a
2756
// multiple of 4). The extra register needed to hold the offset comes from the
2757
// register scavenger, and it is possible that the scavenger will need to use
2758
// an emergency spill slot. As a result, we need to make sure that a spill slot
2759
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
2760
// stack slot.
2761
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
2762
// FIXME: This does not handle the LWA case.
2763
if (VT != MVT::i64)
2764
return;
2765
2766
// NOTE: We'll exclude negative FIs here, which come from argument
2767
// lowering, because there are no known test cases triggering this problem
2768
// using packed structures (or similar). We can remove this exclusion if
2769
// we find such a test case. The reason why this is so test-case driven is
2770
// because this entire 'fixup' is only to prevent crashes (from the
2771
// register scavenger) on not-really-valid inputs. For example, if we have:
2772
// %a = alloca i1
2773
// %b = bitcast i1* %a to i64*
2774
// store i64* a, i64 b
2775
// then the store should really be marked as 'align 1', but is not. If it
2776
// were marked as 'align 1' then the indexed form would have been
2777
// instruction-selected initially, and the problem this 'fixup' is preventing
2778
// won't happen regardless.
2779
if (FrameIdx < 0)
2780
return;
2781
2782
MachineFunction &MF = DAG.getMachineFunction();
2783
MachineFrameInfo &MFI = MF.getFrameInfo();
2784
2785
if (MFI.getObjectAlign(FrameIdx) >= Align(4))
2786
return;
2787
2788
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
2789
FuncInfo->setHasNonRISpills();
2790
}
2791
2792
/// Returns true if the address N can be represented by a base register plus
2793
/// a signed 16-bit displacement [r+imm], and if it is not better
2794
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
2795
/// displacements that are multiples of that value.
2796
bool PPCTargetLowering::SelectAddressRegImm(
2797
SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
2798
MaybeAlign EncodingAlignment) const {
2799
// FIXME dl should come from parent load or store, not from address
2800
SDLoc dl(N);
2801
2802
// If we have a PC Relative target flag don't select as [reg+imm]. It will be
2803
// a [pc+imm].
2804
if (SelectAddressPCRel(N, Base))
2805
return false;
2806
2807
// If this can be more profitably realized as r+r, fail.
2808
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
2809
return false;
2810
2811
if (N.getOpcode() == ISD::ADD) {
2812
int16_t imm = 0;
2813
if (isIntS16Immediate(N.getOperand(1), imm) &&
2814
(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
2815
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
2816
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
2817
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2818
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
2819
} else {
2820
Base = N.getOperand(0);
2821
}
2822
return true; // [r+i]
2823
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
2824
// Match LOAD (ADD (X, Lo(G))).
2825
assert(!N.getOperand(1).getConstantOperandVal(1) &&
2826
"Cannot handle constant offsets yet!");
2827
Disp = N.getOperand(1).getOperand(0); // The global address.
2828
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
2829
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
2830
Disp.getOpcode() == ISD::TargetConstantPool ||
2831
Disp.getOpcode() == ISD::TargetJumpTable);
2832
Base = N.getOperand(0);
2833
return true; // [&g+r]
2834
}
2835
} else if (N.getOpcode() == ISD::OR) {
2836
int16_t imm = 0;
2837
if (isIntS16Immediate(N.getOperand(1), imm) &&
2838
(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
2839
// If this is an or of disjoint bitfields, we can codegen this as an add
2840
// (for better address arithmetic) if the LHS and RHS of the OR are
2841
// provably disjoint.
2842
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2843
2844
if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
2845
// If all of the bits are known zero on the LHS or RHS, the add won't
2846
// carry.
2847
if (FrameIndexSDNode *FI =
2848
dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
2849
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2850
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
2851
} else {
2852
Base = N.getOperand(0);
2853
}
2854
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
2855
return true;
2856
}
2857
}
2858
} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2859
// Loading from a constant address.
2860
2861
// If this address fits entirely in a 16-bit sext immediate field, codegen
2862
// this as "d, 0"
2863
int16_t Imm;
2864
if (isIntS16Immediate(CN, Imm) &&
2865
(!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
2866
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
2867
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2868
CN->getValueType(0));
2869
return true;
2870
}
2871
2872
// Handle 32-bit sext immediates with LIS + addr mode.
2873
if ((CN->getValueType(0) == MVT::i32 ||
2874
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
2875
(!EncodingAlignment ||
2876
isAligned(*EncodingAlignment, CN->getZExtValue()))) {
2877
int Addr = (int)CN->getZExtValue();
2878
2879
// Otherwise, break this down into an LIS + disp.
2880
Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);
2881
2882
Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
2883
MVT::i32);
2884
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
2885
Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
2886
return true;
2887
}
2888
}
2889
2890
Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
2891
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
2892
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2893
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
2894
} else
2895
Base = N;
2896
return true; // [r+0]
2897
}
2898
2899
/// Similar to the 16-bit case but for instructions that take a 34-bit
2900
/// displacement field (prefixed loads/stores).
2901
bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
2902
SDValue &Base,
2903
SelectionDAG &DAG) const {
2904
// Only on 64-bit targets.
2905
if (N.getValueType() != MVT::i64)
2906
return false;
2907
2908
SDLoc dl(N);
2909
int64_t Imm = 0;
2910
2911
if (N.getOpcode() == ISD::ADD) {
2912
if (!isIntS34Immediate(N.getOperand(1), Imm))
2913
return false;
2914
Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
2915
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
2916
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2917
else
2918
Base = N.getOperand(0);
2919
return true;
2920
}
2921
2922
if (N.getOpcode() == ISD::OR) {
2923
if (!isIntS34Immediate(N.getOperand(1), Imm))
2924
return false;
2925
// If this is an or of disjoint bitfields, we can codegen this as an add
2926
// (for better address arithmetic) if the LHS and RHS of the OR are
2927
// provably disjoint.
2928
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2929
if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
2930
return false;
2931
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
2932
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2933
else
2934
Base = N.getOperand(0);
2935
Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
2936
return true;
2937
}
2938
2939
if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
2940
Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
2941
Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
2942
return true;
2943
}
2944
2945
return false;
2946
}
2947
2948
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
2949
/// represented as an indexed [r+r] operation.
2950
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
2951
SDValue &Index,
2952
SelectionDAG &DAG) const {
2953
// Check to see if we can easily represent this as an [r+r] address. This
2954
// will fail if it thinks that the address is more profitably represented as
2955
// reg+imm, e.g. where imm = 0.
2956
if (SelectAddressRegReg(N, Base, Index, DAG))
2957
return true;
2958
2959
// If the address is the result of an add, we will utilize the fact that the
2960
// address calculation includes an implicit add. However, we can reduce
2961
// register pressure if we do not materialize a constant just for use as the
2962
// index register. We only get rid of the add if it is not an add of a
2963
// value and a 16-bit signed constant and both have a single use.
2964
int16_t imm = 0;
2965
if (N.getOpcode() == ISD::ADD &&
2966
(!isIntS16Immediate(N.getOperand(1), imm) ||
2967
!N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
2968
Base = N.getOperand(0);
2969
Index = N.getOperand(1);
2970
return true;
2971
}
2972
2973
// Otherwise, do it the hard way, using R0 as the base register.
2974
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2975
N.getValueType());
2976
Index = N;
2977
return true;
2978
}
2979
2980
/// Returns true if \p N is a node of type \p Ty whose target flags carry the
/// PC-relative flag (as reported by PPCInstrInfo::hasPCRelFlag).
template <typename Ty> static bool isValidPCRelNode(SDValue N) {
  if (const Ty *Node = dyn_cast<Ty>(N))
    return PPCInstrInfo::hasPCRelFlag(Node->getTargetFlags());
  return false;
}
2984
2985
/// Returns true if this address is a PC Relative address.
2986
/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
2987
/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
2988
bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
2989
// This is a materialize PC Relative node. Always select this as PC Relative.
2990
Base = N;
2991
if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
2992
return true;
2993
if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
2994
isValidPCRelNode<GlobalAddressSDNode>(N) ||
2995
isValidPCRelNode<JumpTableSDNode>(N) ||
2996
isValidPCRelNode<BlockAddressSDNode>(N))
2997
return true;
2998
return false;
2999
}
3000
3001
/// Returns true if we should use a direct load into vector instruction
3002
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
3003
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
3004
3005
// If there are any other uses other than scalar to vector, then we should
3006
// keep it as a scalar load -> direct move pattern to prevent multiple
3007
// loads.
3008
LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
3009
if (!LD)
3010
return false;
3011
3012
EVT MemVT = LD->getMemoryVT();
3013
if (!MemVT.isSimple())
3014
return false;
3015
switch(MemVT.getSimpleVT().SimpleTy) {
3016
case MVT::i64:
3017
break;
3018
case MVT::i32:
3019
if (!ST.hasP8Vector())
3020
return false;
3021
break;
3022
case MVT::i16:
3023
case MVT::i8:
3024
if (!ST.hasP9Vector())
3025
return false;
3026
break;
3027
default:
3028
return false;
3029
}
3030
3031
SDValue LoadedVal(N, 0);
3032
if (!LoadedVal.hasOneUse())
3033
return false;
3034
3035
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
3036
UI != UE; ++UI)
3037
if (UI.getUse().get().getResNo() == 0 &&
3038
UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
3039
UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
3040
return false;
3041
3042
return true;
3043
}
3044
3045
/// getPreIndexedAddressParts - returns true by value, base pointer and
3046
/// offset pointer and addressing mode by reference if the node's address
3047
/// can be legally represented as pre-indexed load / store address.
3048
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
3049
SDValue &Offset,
3050
ISD::MemIndexedMode &AM,
3051
SelectionDAG &DAG) const {
3052
if (DisablePPCPreinc) return false;
3053
3054
bool isLoad = true;
3055
SDValue Ptr;
3056
EVT VT;
3057
Align Alignment;
3058
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
3059
Ptr = LD->getBasePtr();
3060
VT = LD->getMemoryVT();
3061
Alignment = LD->getAlign();
3062
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
3063
Ptr = ST->getBasePtr();
3064
VT = ST->getMemoryVT();
3065
Alignment = ST->getAlign();
3066
isLoad = false;
3067
} else
3068
return false;
3069
3070
// Do not generate pre-inc forms for specific loads that feed scalar_to_vector
3071
// instructions because we can fold these into a more efficient instruction
3072
// instead, (such as LXSD).
3073
if (isLoad && usePartialVectorLoads(N, Subtarget)) {
3074
return false;
3075
}
3076
3077
// PowerPC doesn't have preinc load/store instructions for vectors
3078
if (VT.isVector())
3079
return false;
3080
3081
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
3082
// Common code will reject creating a pre-inc form if the base pointer
3083
// is a frame index, or if N is a store and the base pointer is either
3084
// the same as or a predecessor of the value being stored. Check for
3085
// those situations here, and try with swapped Base/Offset instead.
3086
bool Swap = false;
3087
3088
if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
3089
Swap = true;
3090
else if (!isLoad) {
3091
SDValue Val = cast<StoreSDNode>(N)->getValue();
3092
if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
3093
Swap = true;
3094
}
3095
3096
if (Swap)
3097
std::swap(Base, Offset);
3098
3099
AM = ISD::PRE_INC;
3100
return true;
3101
}
3102
3103
// LDU/STU can only handle immediates that are a multiple of 4.
3104
if (VT != MVT::i64) {
3105
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, std::nullopt))
3106
return false;
3107
} else {
3108
// LDU/STU need an address with at least 4-byte alignment.
3109
if (Alignment < Align(4))
3110
return false;
3111
3112
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
3113
return false;
3114
}
3115
3116
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
3117
// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
3118
// sext i32 to i64 when addr mode is r+i.
3119
if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
3120
LD->getExtensionType() == ISD::SEXTLOAD &&
3121
isa<ConstantSDNode>(Offset))
3122
return false;
3123
}
3124
3125
AM = ISD::PRE_INC;
3126
return true;
3127
}
3128
3129
//===----------------------------------------------------------------------===//
3130
// LowerOperation implementation
3131
//===----------------------------------------------------------------------===//
3132
3133
/// Return true if we should reference labels using a PICBase, set the HiOpFlags
3134
/// and LoOpFlags to the target MO flags.
3135
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
3136
unsigned &HiOpFlags, unsigned &LoOpFlags,
3137
const GlobalValue *GV = nullptr) {
3138
HiOpFlags = PPCII::MO_HA;
3139
LoOpFlags = PPCII::MO_LO;
3140
3141
// Don't use the pic base if not in PIC relocation model.
3142
if (IsPIC) {
3143
HiOpFlags = PPCII::MO_PIC_HA_FLAG;
3144
LoOpFlags = PPCII::MO_PIC_LO_FLAG;
3145
}
3146
}
3147
3148
/// Combine the high and low halves of a label reference into a full address:
/// (hi(&g) + lo(&g)), with the global base register added to the high half
/// when \p isPIC is set. The result type is the type of \p HiPart.
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                             SelectionDAG &DAG) {
  SDLoc Loc(HiPart);
  const EVT AddrVT = HiPart.getValueType();
  SDValue ZeroOp = DAG.getConstant(0, Loc, AddrVT);

  SDValue HiHalf = DAG.getNode(PPCISD::Hi, Loc, AddrVT, HiPart, ZeroOp);
  SDValue LoHalf = DAG.getNode(PPCISD::Lo, Loc, AddrVT, LoPart, ZeroOp);

  // With PIC, the first instruction is actually "GR+hi(&G)".
  if (isPIC) {
    SDValue PICBase = DAG.getNode(PPCISD::GlobalBaseReg, Loc, AddrVT);
    HiHalf = DAG.getNode(ISD::ADD, Loc, AddrVT, PICBase, HiHalf);
  }

  // Generate non-pic code that has direct accesses to the constant pool.
  // The address of the global is just (hi(&g)+lo(&g)).
  return DAG.getNode(ISD::ADD, Loc, AddrVT, HiHalf, LoHalf);
}
3166
3167
/// Record in the function's PPCFunctionInfo that it uses the TOC base
/// pointer.
static void setUsesTOCBasePtr(MachineFunction &MF) {
  MF.getInfo<PPCFunctionInfo>()->setUsesTOCBasePtr();
}
3171
3172
/// Convenience overload: marks the machine function that \p DAG is building
/// as using the TOC base pointer.
static void setUsesTOCBasePtr(SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  setUsesTOCBasePtr(MF);
}
3175
3176
/// Build a PPCISD::TOC_ENTRY memory-intrinsic node (a load through GOT
/// pointer info) for \p GA. The base operand is X2 on 64-bit targets, R2 on
/// 32-bit AIX, and a GlobalBaseReg node otherwise.
SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
                                       SDValue GA) const {
  const bool Is64Bit = Subtarget.isPPC64();
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;

  // Pick the register (or node) that holds the table base.
  SDValue TableBase;
  if (Is64Bit)
    TableBase = DAG.getRegister(PPC::X2, VT);
  else if (Subtarget.isAIXABI())
    TableBase = DAG.getRegister(PPC::R2, VT);
  else
    TableBase = DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);

  SDValue Ops[] = {GA, TableBase};
  return DAG.getMemIntrinsicNode(
      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), std::nullopt,
      MachineMemOperand::MOLoad);
}
3190
3191
/// Lower a constant-pool node to the address of its constant-pool entry.
///
/// - 64-bit ELF and AIX: a PC-relative materialization when PC-relative calls
///   are in use, otherwise a TOC entry.
/// - PIC on the SVR4 ABI (when the case above did not apply): a TOC-entry
///   node whose symbol operand is tagged with MO_PIC_FLAG.
/// - Otherwise: a hi/lo pair combined by LowerLabelRef.
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  const Constant *C = CP->getConstVal();

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual address of the constant is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDLoc DL(CP);
      EVT Ty = getPointerTy(DAG.getDataLayout());
      SDValue ConstPool = DAG.getTargetConstantPool(
          C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
      return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
    }
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
    return getTOCEntry(DAG, SDLoc(CP), GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    // MO_PIC_FLAG is a target flag, so it must be passed as the TargetFlags
    // argument (after the offset), not in the Offset position. This matches
    // how the jump-table and global-address lowerings tag their operands.
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(),
                                           /*Offset=*/0, PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(CP), GA);
  }

  SDValue CPIHi =
      DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
  SDValue CPILo =
      DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
  return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}
3228
3229
// For 64-bit PowerPC, prefer the more compact relative encodings.
3230
// This trades 32 bits per jump table entry for one or two instructions
3231
// on the jump site.
3232
unsigned PPCTargetLowering::getJumpTableEncoding() const {
3233
if (isJumpTableRelative())
3234
return MachineJumpTableInfo::EK_LabelDifference32;
3235
3236
return TargetLowering::getJumpTableEncoding();
3237
}
3238
3239
bool PPCTargetLowering::isJumpTableRelative() const {
3240
if (UseAbsoluteJumpTables)
3241
return false;
3242
if (Subtarget.isPPC64() || Subtarget.isAIXABI())
3243
return true;
3244
return TargetLowering::isJumpTableRelative();
3245
}
3246
3247
/// Return the node that relative jump-table entries are measured from.
/// Non-PPC64 targets, AIX, and the small/medium code models use the generic
/// TargetLowering base; the remaining code models use a GlobalBaseReg node.
SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  const bool UseGenericBase =
      !Subtarget.isPPC64() || Subtarget.isAIXABI() ||
      getTargetMachine().getCodeModel() == CodeModel::Small ||
      getTargetMachine().getCodeModel() == CodeModel::Medium;

  if (UseGenericBase)
    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);

  return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
                     getPointerTy(DAG.getDataLayout()));
}
3261
3262
/// MC-level counterpart of getPICJumpTableRelocBase: non-PPC64 targets, AIX,
/// and the small/medium code models use the generic TargetLowering
/// expression; the remaining code models reference the function's PIC base
/// symbol.
const MCExpr *
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                                unsigned JTI,
                                                MCContext &Ctx) const {
  const bool UseGenericBase =
      !Subtarget.isPPC64() || Subtarget.isAIXABI() ||
      getTargetMachine().getCodeModel() == CodeModel::Small ||
      getTargetMachine().getCodeModel() == CodeModel::Medium;

  if (UseGenericBase)
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
3277
3278
/// Lower a jump-table node to the address of the table: PC-relative
/// materialization, a TOC entry, a PIC TOC-entry node, or a hi/lo pair,
/// depending on the subtarget and relocation model.
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  const int TableIdx = JT->getIndex();
  EVT PtrVT = Op.getValueType();

  // isUsingPCRelativeCalls() returns true when PCRelative is enabled
  if (Subtarget.isUsingPCRelativeCalls()) {
    SDLoc DL(JT);
    EVT Ty = getPointerTy(DAG.getDataLayout());
    SDValue Target = DAG.getTargetJumpTable(TableIdx, Ty, PPCII::MO_PCREL_FLAG);
    return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, Target);
  }

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual address of the jump table is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    setUsesTOCBasePtr(DAG);
    SDValue Target = DAG.getTargetJumpTable(TableIdx, PtrVT);
    return getTOCEntry(DAG, SDLoc(JT), Target);
  }

  const bool IsPIC = isPositionIndependent();

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue Target =
        DAG.getTargetJumpTable(TableIdx, PtrVT, PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(Target), Target);
  }

  // Fall back to a hi/lo label reference.
  unsigned HiFlag, LoFlag;
  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag);
  SDValue TableHi = DAG.getTargetJumpTable(TableIdx, PtrVT, HiFlag);
  SDValue TableLo = DAG.getTargetJumpTable(TableIdx, PtrVT, LoFlag);
  return LowerLabelRef(TableHi, TableLo, IsPIC, DAG);
}
3314
3315
/// Lower a block-address node: PC-relative materialization, a TOC entry
/// (64-bit ELF / AIX), a TOC-entry node for 32-bit PIC ELF, or a hi/lo pair.
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BASDN->getBlockAddress();
  EVT PtrVT = Op.getValueType();

  // isUsingPCRelativeCalls() returns true when PCRelative is enabled
  if (Subtarget.isUsingPCRelativeCalls()) {
    SDLoc DL(BASDN);
    EVT Ty = getPointerTy(DAG.getDataLayout());
    SDValue Target = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
                                               PPCII::MO_PCREL_FLAG);
    return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, Target);
  }

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual BlockAddress is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    setUsesTOCBasePtr(DAG);
    SDValue Target = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
    return getTOCEntry(DAG, SDLoc(BASDN), Target);
  }

  const bool IsPIC = isPositionIndependent();

  // 32-bit position-independent ELF stores the BlockAddress in the .got.
  if (Subtarget.is32BitELFABI() && IsPIC) {
    SDValue Target = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
    return getTOCEntry(DAG, SDLoc(BASDN), Target);
  }

  // Fall back to a hi/lo label reference.
  unsigned HiFlag, LoFlag;
  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag);
  SDValue AddrHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, HiFlag);
  SDValue AddrLo = DAG.getTargetBlockAddress(BA, PtrVT, 0, LoFlag);
  return LowerLabelRef(AddrHi, AddrLo, IsPIC, DAG);
}
3352
3353
/// Dispatch TLS global-address lowering to the AIX or the Linux
/// implementation according to the subtarget ABI.
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  return Subtarget.isAIXABI() ? LowerGlobalTLSAddressAIX(Op, DAG)
                              : LowerGlobalTLSAddressLinux(Op, DAG);
}
3360
3361
/// updateForAIXShLibTLSModelOpt - Helper to initialize TLS model opt settings,
3362
/// and then apply the update.
3363
static void updateForAIXShLibTLSModelOpt(TLSModel::Model &Model,
3364
SelectionDAG &DAG,
3365
const TargetMachine &TM) {
3366
// Initialize TLS model opt setting lazily:
3367
// (1) Use initial-exec for single TLS var references within current function.
3368
// (2) Use local-dynamic for multiple TLS var references within current
3369
// function.
3370
PPCFunctionInfo *FuncInfo =
3371
DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
3372
if (!FuncInfo->isAIXFuncTLSModelOptInitDone()) {
3373
SmallPtrSet<const GlobalValue *, 8> TLSGV;
3374
// Iterate over all instructions within current function, collect all TLS
3375
// global variables (global variables taken as the first parameter to
3376
// Intrinsic::threadlocal_address).
3377
const Function &Func = DAG.getMachineFunction().getFunction();
3378
for (Function::const_iterator BI = Func.begin(), BE = Func.end(); BI != BE;
3379
++BI)
3380
for (BasicBlock::const_iterator II = BI->begin(), IE = BI->end();
3381
II != IE; ++II)
3382
if (II->getOpcode() == Instruction::Call)
3383
if (const CallInst *CI = dyn_cast<const CallInst>(&*II))
3384
if (Function *CF = CI->getCalledFunction())
3385
if (CF->isDeclaration() &&
3386
CF->getIntrinsicID() == Intrinsic::threadlocal_address)
3387
if (const GlobalValue *GV =
3388
dyn_cast<GlobalValue>(II->getOperand(0))) {
3389
TLSModel::Model GVModel = TM.getTLSModel(GV);
3390
if (GVModel == TLSModel::LocalDynamic)
3391
TLSGV.insert(GV);
3392
}
3393
3394
unsigned TLSGVCnt = TLSGV.size();
3395
LLVM_DEBUG(dbgs() << format("LocalDynamic TLSGV count:%d\n", TLSGVCnt));
3396
if (TLSGVCnt <= PPCAIXTLSModelOptUseIEForLDLimit)
3397
FuncInfo->setAIXFuncUseTLSIEForLD();
3398
FuncInfo->setAIXFuncTLSModelOptInitDone();
3399
}
3400
3401
if (FuncInfo->isAIXFuncUseTLSIEForLD()) {
3402
LLVM_DEBUG(
3403
dbgs() << DAG.getMachineFunction().getName()
3404
<< " function is using the TLS-IE model for TLS-LD access.\n");
3405
Model = TLSModel::InitialExec;
3406
}
3407
}
3408
3409
/// Lower a thread-local global address on AIX.
///
/// The access sequence depends on the TLS model of the global (optionally
/// adjusted by updateForAIXShLibTLSModelOpt):
///  - local-exec / initial-exec: thread pointer + variable offset from the
///    TOC, or a faster immediate-offset form for small local-exec variables
///    on 64-bit;
///  - local-dynamic: module handle (via the _$TLSML global) + variable offset;
///  - otherwise general-dynamic, using variable-offset and region-handle TOC
///    entries.
/// Emulated TLS is rejected with a fatal error.
SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
                                                    SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().useEmulatedTLS())
    report_fatal_error("Emulated TLS is not yet supported on AIX");

  SDLoc dl(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool Is64Bit = Subtarget.isPPC64();
  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);

  // Apply update to the TLS model.
  if (Subtarget.hasAIXShLibTLSModelOpt())
    updateForAIXShLibTLSModelOpt(Model, DAG, getTargetMachine());

  bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;

  if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
    bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
    bool HasAIXSmallTLSGlobalAttr = false;
    SDValue VariableOffsetTGA =
        DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
    SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
    SDValue TLSReg;

    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
      if (GVar->hasAttribute("aix-small-tls"))
        HasAIXSmallTLSGlobalAttr = true;

    if (Is64Bit) {
      // For local-exec and initial-exec on AIX (64-bit), the sequence generated
      // involves a load of the variable offset (from the TOC), followed by an
      // add of the loaded variable offset to R13 (the thread pointer).
      // This code sequence looks like:
      //    ld reg1,var[TC](2)
      //    add reg2, reg1, r13     // r13 contains the thread pointer
      TLSReg = DAG.getRegister(PPC::X13, MVT::i64);

      // With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
      // global variable attribute, produce a faster access sequence for
      // local-exec TLS variables where the offset from the TLS base is encoded
      // as an immediate operand.
      //
      // We only utilize the faster local-exec access sequence when the TLS
      // variable has a size within the policy limit. We treat types that are
      // not sized or are empty as being over the policy size limit.
      if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
          IsTLSLocalExecModel) {
        Type *GVType = GV->getValueType();
        if (GVType->isSized() && !GVType->isEmptyTy() &&
            GV->getDataLayout().getTypeAllocSize(GVType) <=
                AIXSmallTlsPolicySizeLimit)
          return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA, TLSReg);
      }
    } else {
      // For local-exec and initial-exec on AIX (32-bit), the sequence generated
      // involves loading the variable offset from the TOC, generating a call to
      // .__get_tpointer to get the thread pointer (which will be in R3), and
      // adding the two together:
      //    lwz reg1,var[TC](2)
      //    bla .__get_tpointer
      //    add reg2, reg1, r3
      TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);

      // We do not implement the 32-bit version of the faster access sequence
      // for local-exec that is controlled by the -maix-small-local-exec-tls
      // option, or the "aix-small-tls" global variable attribute.
      if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
        report_fatal_error("The small-local-exec TLS access sequence is "
                           "currently only supported on AIX (64-bit mode).");
    }
    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);
  }

  if (Model == TLSModel::LocalDynamic) {
    bool HasAIXSmallLocalDynamicTLS = Subtarget.hasAIXSmallLocalDynamicTLS();

    // We do not implement the 32-bit version of the faster access sequence
    // for local-dynamic that is controlled by -maix-small-local-dynamic-tls.
    if (!Is64Bit && HasAIXSmallLocalDynamicTLS)
      report_fatal_error("The small-local-dynamic TLS access sequence is "
                         "currently only supported on AIX (64-bit mode).");

    // For local-dynamic on AIX, we need to generate one TOC entry for each
    // variable offset, and a single module-handle TOC entry for the entire
    // file.

    SDValue VariableOffsetTGA =
        DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSLD_FLAG);
    SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);

    Module *M = DAG.getMachineFunction().getFunction().getParent();
    GlobalVariable *TLSGV =
        dyn_cast_or_null<GlobalVariable>(M->getOrInsertGlobal(
            StringRef("_$TLSML"), PointerType::getUnqual(*DAG.getContext())));
    // Check the result before it is first dereferenced, so that a failure is
    // reported by the assertion rather than by a null dereference.
    assert(TLSGV && "Not able to create GV for _$TLSML.");
    TLSGV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);
    SDValue ModuleHandleTGA =
        DAG.getTargetGlobalAddress(TLSGV, dl, PtrVT, 0, PPCII::MO_TLSLDM_FLAG);
    SDValue ModuleHandleTOC = getTOCEntry(DAG, dl, ModuleHandleTGA);
    SDValue ModuleHandle =
        DAG.getNode(PPCISD::TLSLD_AIX, dl, PtrVT, ModuleHandleTOC);

    // With the -maix-small-local-dynamic-tls option, produce a faster access
    // sequence for local-dynamic TLS variables where the offset from the
    // module-handle is encoded as an immediate operand.
    //
    // We only utilize the faster local-dynamic access sequence when the TLS
    // variable has a size within the policy limit. We treat types that are
    // not sized or are empty as being over the policy size limit.
    if (HasAIXSmallLocalDynamicTLS) {
      Type *GVType = GV->getValueType();
      if (GVType->isSized() && !GVType->isEmptyTy() &&
          GV->getDataLayout().getTypeAllocSize(GVType) <=
              AIXSmallTlsPolicySizeLimit)
        return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA,
                           ModuleHandle);
    }

    return DAG.getNode(ISD::ADD, dl, PtrVT, ModuleHandle, VariableOffset);
  }

  // If Local- or Initial-exec or Local-dynamic is not possible or specified,
  // all GlobalTLSAddress nodes are lowered using the general-dynamic model. We
  // need to generate two TOC entries, one for the variable offset, one for the
  // region handle. The global address for the TOC entry of the region handle is
  // created with the MO_TLSGDM_FLAG flag and the global address for the TOC
  // entry of the variable offset is created with MO_TLSGD_FLAG.
  SDValue VariableOffsetTGA =
      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGD_FLAG);
  SDValue RegionHandleTGA =
      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGDM_FLAG);
  SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
  SDValue RegionHandle = getTOCEntry(DAG, dl, RegionHandleTGA);
  return DAG.getNode(PPCISD::TLSGD_AIX, dl, PtrVT, VariableOffset,
                     RegionHandle);
}
3548
3549
/// Lower a thread-local global address on Linux, selecting the node sequence
/// by TLS model (local-exec, initial-exec, general-dynamic, local-dynamic)
/// and by whether PC-relative addressing is in use. Emulated TLS is delegated
/// to LowerToTLSEmulatedModel.
SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
                                                      SelectionDAG &DAG) const {
  // FIXME: TLS addresses currently use medium model code sequences,
  // which is the most useful form.  Eventually support for small and
  // large models could be added if users need it, at the cost of
  // additional complexity.
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  SDLoc dl(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const bool Is64 = Subtarget.isPPC64();
  const bool UsePCRel = Subtarget.isUsingPCRelativeCalls();
  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
  const PICLevel::Level PICLvl = Mod->getPICLevel();
  const TargetMachine &TM = getTargetMachine();

  // Target global address for GV (offset 0) carrying the given target flags.
  auto SymbolRef = [&](unsigned Flags) -> SDValue {
    return DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, Flags);
  };

  // GOT pointer for the non-PC-relative sequences. On 64-bit this is an
  // ADDIS-style node (AddisOpc) applied to X2 and Sym; on 32-bit it depends
  // on the PIC level and, when MayBeNonPIC is set, on whether the target
  // machine is position independent at all.
  auto GOTPointer = [&](unsigned AddisOpc, SDValue Sym,
                        bool MayBeNonPIC) -> SDValue {
    if (Is64) {
      setUsesTOCBasePtr(DAG);
      SDValue TOCReg = DAG.getRegister(PPC::X2, MVT::i64);
      return DAG.getNode(AddisOpc, dl, PtrVT, TOCReg, Sym);
    }
    if (MayBeNonPIC && !TM.isPositionIndependent())
      return DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
    if (PICLvl == PICLevel::SmallPIC)
      return DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
    return DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
  };

  switch (TM.getTLSModel(GV)) {
  case TLSModel::LocalExec: {
    if (UsePCRel) {
      SDValue ThreadPtr = DAG.getRegister(PPC::X13, MVT::i64);
      SDValue Sym = SymbolRef(PPCII::MO_TPREL_PCREL_FLAG);
      SDValue MatAddr =
          DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, Sym);
      return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, ThreadPtr, MatAddr);
    }

    // Thread pointer plus the hi/lo halves of the TP-relative offset.
    SDValue SymHi = SymbolRef(PPCII::MO_TPREL_HA);
    SDValue SymLo = SymbolRef(PPCII::MO_TPREL_LO);
    SDValue ThreadPtr = Is64 ? DAG.getRegister(PPC::X13, MVT::i64)
                             : DAG.getRegister(PPC::R2, MVT::i32);
    SDValue HiSum = DAG.getNode(PPCISD::Hi, dl, PtrVT, SymHi, ThreadPtr);
    return DAG.getNode(PPCISD::Lo, dl, PtrVT, SymLo, HiSum);
  }

  case TLSModel::InitialExec: {
    SDValue Sym = SymbolRef(UsePCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
    SDValue SymTLS =
        SymbolRef(UsePCRel ? PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);
    SDValue TPOffset;
    if (UsePCRel) {
      SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, Sym);
      TPOffset = DAG.getLoad(MVT::i64, dl, DAG.getEntryNode(), MatPCRel,
                             MachinePointerInfo());
    } else {
      SDValue GOTPtr = GOTPointer(PPCISD::ADDIS_GOT_TPREL_HA, Sym,
                                  /*MayBeNonPIC=*/true);
      TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, Sym, GOTPtr);
    }
    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, SymTLS);
  }

  case TLSModel::GeneralDynamic: {
    if (UsePCRel) {
      SDValue Sym = SymbolRef(PPCII::MO_GOT_TLSGD_PCREL_FLAG);
      return DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, Sym);
    }

    SDValue Sym = SymbolRef(0);
    SDValue GOTPtr =
        GOTPointer(PPCISD::ADDIS_TLSGD_HA, Sym, /*MayBeNonPIC=*/false);
    return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT, GOTPtr, Sym, Sym);
  }

  case TLSModel::LocalDynamic: {
    if (UsePCRel) {
      SDValue Sym = SymbolRef(PPCII::MO_GOT_TLSLD_PCREL_FLAG);
      SDValue MatPCRel =
          DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, Sym);
      return DAG.getNode(PPCISD::PADDI_DTPREL, dl, PtrVT, MatPCRel, Sym);
    }

    SDValue Sym = SymbolRef(0);
    SDValue GOTPtr =
        GOTPointer(PPCISD::ADDIS_TLSLD_HA, Sym, /*MayBeNonPIC=*/false);
    SDValue TLSAddr =
        DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl, PtrVT, GOTPtr, Sym, Sym);
    SDValue DtvOffsetHi =
        DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, TLSAddr, Sym);
    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, Sym);
  }
  }

  llvm_unreachable("Unknown TLS model!");
}
3676
3677
/// Lower a global-address node: PC-relative materialization (with a load
/// when the global is accessed GOT-indirect), a TOC entry, a PIC TOC-entry
/// node, or a hi/lo pair, depending on the subtarget and relocation model.
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(GSDN);
  const GlobalValue *GV = GSDN->getGlobal();
  EVT PtrVT = Op.getValueType();

  // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      EVT Ty = getPointerTy(DAG.getDataLayout());
      const bool ViaGOT = isAccessedAsGotIndirect(Op);
      SDValue Sym = DAG.getTargetGlobalAddress(
          GV, DL, Ty, GSDN->getOffset(),
          ViaGOT ? PPCII::MO_GOT_PCREL_FLAG : PPCII::MO_PCREL_FLAG);
      SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, Sym);
      if (!ViaGOT)
        return MatPCRel;
      // GOT-indirect: the materialized address is loaded through.
      return DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
                         MachinePointerInfo());
    }

    setUsesTOCBasePtr(DAG);
    SDValue Sym = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
    return getTOCEntry(DAG, DL, Sym);
  }

  const bool IsPIC = isPositionIndependent();

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue Sym = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(),
                                             PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, DL, Sym);
  }

  // Fall back to a hi/lo label reference.
  unsigned HiFlag, LoFlag;
  getLabelAccessInfo(IsPIC, Subtarget, HiFlag, LoFlag, GV);
  SDValue SymHi =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), HiFlag);
  SDValue SymLo =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), LoFlag);

  return LowerLabelRef(SymHi, SymLo, IsPIC, DAG);
}
3725
3726
/// Custom lowering for SETCC and its strict floating-point forms.
///
/// Returns a replacement value, \p Op itself when the node should be kept
/// as-is, or an empty SDValue when no custom lowering is produced here.
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  const bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain in operand 0, which shifts LHS/RHS/CC by one.
  const unsigned FirstArg = IsStrict ? 1 : 0;

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(FirstArg + 2))->get();
  SDValue LHS = Op.getOperand(FirstArg);
  SDValue RHS = Op.getOperand(FirstArg + 1);
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  const EVT LHSVT = LHS.getValueType();
  const EVT ResVT = Op.getValueType();
  SDLoc dl(Op);

  // Soften the setcc with libcall if it is fp128.
  if (LHSVT == MVT::f128) {
    assert(!Subtarget.hasP9Vector() &&
           "SETCC for f128 is already legal under Power9!");
    softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,
                        Op->getOpcode() == ISD::STRICT_FSETCCS);
    // If softening left a right-hand operand, a comparison of the two
    // operands is still needed; otherwise LHS is used directly.
    if (RHS.getNode())
      LHS = DAG.getNode(ISD::SETCC, dl, ResVT, LHS, RHS, DAG.getCondCode(CC));
    if (!IsStrict)
      return LHS;
    return DAG.getMergeValues({LHS, Chain}, dl);
  }

  assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");

  if (ResVT == MVT::v2i64) {
    // We handle most of these in the usual way. Only v2i64 operands need
    // something special, because VSX has no underlying comparison operations
    // for them.
    if (LHSVT != MVT::v2i64)
      return Op;

    // Equality can be handled by casting to the legal type for Altivec
    // comparisons, everything else needs to be expanded.
    if (CC != ISD::SETEQ && CC != ISD::SETNE)
      return SDValue();

    // Compare as four words, then combine each word's result with that of
    // the other word in the same doubleword: AND for equality, OR for
    // inequality.
    SDValue LHSWords = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS);
    SDValue RHSWords = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS);
    SDValue WordCmp = DAG.getSetCC(dl, MVT::v4i32, LHSWords, RHSWords, CC);
    int SwapWordsInDword[] = {1, 0, 3, 2};
    SDValue Swapped = DAG.getVectorShuffle(MVT::v4i32, dl, WordCmp, WordCmp,
                                           SwapWordsInDword);
    const unsigned CombineOpc = CC == ISD::SETEQ ? ISD::AND : ISD::OR;
    SDValue Combined =
        DAG.getNode(CombineOpc, dl, MVT::v4i32, Swapped, WordCmp);
    return DAG.getBitcast(MVT::v2i64, Combined);
  }

  // If we're comparing for equality to zero, expose the fact that this is
  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
  // fold the new nodes.
  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
    return V;

  // Leave comparisons against 0 and -1 alone for now, since they're usually
  // optimized. FIXME: revisit this when we can custom lower all setcc
  // optimizations.
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS))
    if (RHSC->isAllOnes() || RHSC->isZero())
      return SDValue();

  // Only integer seteq/setne is rewritten below.
  if (!LHSVT.isInteger() || (CC != ISD::SETEQ && CC != ISD::SETNE))
    return SDValue();

  // Turn the comparison into a compare against zero by xor'ing the rhs with
  // the lhs, which is faster than setting a condition register, reading it
  // back out, and masking the correct bit. The normal approach here uses sub
  // to do this instead of xor. Using xor exposes the result to other
  // bit-twiddling opportunities.
  SDValue Diff = DAG.getNode(ISD::XOR, dl, LHSVT, LHS, RHS);
  return DAG.getSetCC(dl, ResVT, Diff, DAG.getConstant(0, dl, LHSVT), CC);
}
3801
3802
/// Custom lowering for VAARG on PPC32.
///
/// The va_list is accessed at fixed byte offsets from its base pointer:
/// the gpr index byte at 0, the fpr index byte at 1, the overflow-area
/// pointer at 4 and the register-save-area pointer at 8. The argument is
/// loaded either from the register save area or from the overflow area,
/// and the relevant index / overflow pointer is written back.
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  const EVT VT = Node->getValueType(0);
  const EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Chain = Node->getOperand(0);
  SDValue VAListPtr = Node->getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();

  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");

  // Integer types come from the GPR part of the save area, everything else
  // from the FPR part.
  const bool FromGPRs = VT.isInteger();
  auto ConstI32 = [&](uint64_t Imm) {
    return DAG.getConstant(Imm, dl, MVT::i32);
  };

  // gpr_index
  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
                                    VAListPtr, MachinePointerInfo(SV), MVT::i8);
  Chain = GprIndex.getValue(1);

  if (VT == MVT::i64) {
    // An i64 consumes two registers; bump an odd index to the next even one.
    SDValue LowBit = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex, ConstI32(1));
    SDValue IsOdd = DAG.getSetCC(dl, MVT::i32, LowBit, ConstI32(0), ISD::SETNE);
    SDValue NextIndex =
        DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex, ConstI32(1));
    GprIndex =
        DAG.getNode(ISD::SELECT, dl, MVT::i32, IsOdd, NextIndex, GprIndex);
  }

  // fpr index is 1 byte after gpr
  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, ConstI32(1));

  // fpr_index
  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain, FprPtr,
                                    MachinePointerInfo(SV), MVT::i8);
  Chain = FprIndex.getValue(1);

  SDValue RegSaveAreaPtr =
      DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, ConstI32(8));
  SDValue OverflowAreaPtr =
      DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, ConstI32(4));

  // Load the two area pointers.
  SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, Chain, OverflowAreaPtr,
                                     MachinePointerInfo());
  Chain = OverflowArea.getValue(1);

  SDValue RegSaveArea =
      DAG.getLoad(MVT::i32, dl, Chain, RegSaveAreaPtr, MachinePointerInfo());
  Chain = RegSaveArea.getValue(1);

  // From here on only one of the two indices is relevant.
  SDValue RegIndex = FromGPRs ? GprIndex : FprIndex;
  SDValue RegIndexPtr = FromGPRs ? VAListPtr : FprPtr;
  const unsigned SlotBytes = FromGPRs ? 4 : 8;

  // True while the index is below 8, i.e. the argument is taken from the
  // register save area; otherwise the overflow area is used.
  SDValue InRegs =
      DAG.getSetCC(dl, MVT::i32, RegIndex, ConstI32(8), ISD::SETLT);

  // Byte offset within the save area: index * 4 (integer) or * 8 (other).
  SDValue RegOffset =
      DAG.getNode(ISD::MUL, dl, MVT::i32, RegIndex, ConstI32(SlotBytes));

  // RegAddr = RegSaveArea + RegOffset
  SDValue RegAddr = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea, RegOffset);

  // Floating types are 32 bytes into RegSaveArea
  if (VT.isFloatingPoint())
    RegAddr = DAG.getNode(ISD::ADD, dl, PtrVT, RegAddr, ConstI32(32));

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue NewIndex = DAG.getNode(ISD::ADD, dl, MVT::i32, RegIndex,
                                 ConstI32(VT == MVT::i64 ? 2 : 1));

  Chain = DAG.getTruncStore(Chain, dl, NewIndex, RegIndexPtr,
                            MachinePointerInfo(SV), MVT::i8);

  // determine if we should load from reg_save_area or overflow_area
  SDValue ArgAddr =
      DAG.getNode(ISD::SELECT, dl, PtrVT, InRegs, RegAddr, OverflowArea);

  // Advance overflow_area by 4/8 only when the argument came from it.
  SDValue OverflowAreaPlusN =
      DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea, ConstI32(SlotBytes));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, InRegs, OverflowArea,
                             OverflowAreaPlusN);

  Chain = DAG.getTruncStore(Chain, dl, OverflowArea, OverflowAreaPtr,
                            MachinePointerInfo(), MVT::i32);

  return DAG.getLoad(VT, dl, Chain, ArgAddr, MachinePointerInfo());
}
3900
3901
/// Custom lowering for VACOPY on PPC32: copy the whole va_list structure
/// with a 12-byte memcpy.
SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
  SDValue CopySize = DAG.getConstant(12, dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, CopySize, Align(8),
                       /*isVol=*/false, /*AlwaysInline=*/true,
                       /*CI=*/nullptr, std::nullopt, MachinePointerInfo(),
                       MachinePointerInfo());
}
3911
3912
/// Custom lowering for ADJUST_TRAMPOLINE: the node's first operand is
/// returned unchanged. The operation is rejected with a fatal error on AIX.
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  if (!Subtarget.isAIXABI())
    return Op.getOperand(0);

  report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");
}
3919
3920
/// Inspect an inline-asm node for definitions or clobbers of the link
/// register. If LR or LR8 appears among them, the function info is marked
/// as requiring an LR store. The node itself is always returned unchanged.
SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo &FuncInfo = *MF.getInfo<PPCFunctionInfo>();

  assert((Op.getOpcode() == ISD::INLINEASM ||
          Op.getOpcode() == ISD::INLINEASM_BR) &&
         "Expecting Inline ASM node.");

  // If an LR store is already known to be required then there is no point in
  // checking this ASM as well.
  if (FuncInfo.isLRStoreRequired())
    return Op;

  // Inline ASM nodes have an optional last operand that is an incoming Flag of
  // type MVT::Glue. We want to ignore this last operand if that is the case.
  unsigned NumOps = Op.getNumOperands();
  if (Op.getOperand(NumOps - 1).getValueType() == MVT::Glue)
    --NumOps;

  // Walk the operand groups. Each group starts with a flag word describing
  // its kind and how many register operands follow it.
  unsigned Idx = InlineAsm::Op_FirstOperand;
  while (Idx != NumOps) {
    const InlineAsm::Flag Flags(Op.getConstantOperandVal(Idx));
    const unsigned NumVals = Flags.getNumOperandRegisters();
    ++Idx; // Skip the ID value.

    switch (Flags.getKind()) {
    default:
      llvm_unreachable("Bad flags!");
    case InlineAsm::Kind::RegUse:
    case InlineAsm::Kind::Imm:
    case InlineAsm::Kind::Mem:
      // These groups are not checked for LR; step over their operands.
      Idx += NumVals;
      break;
    case InlineAsm::Kind::Clobber:
    case InlineAsm::Kind::RegDef:
    case InlineAsm::Kind::RegDefEarlyClobber:
      for (const unsigned GroupEnd = Idx + NumVals; Idx != GroupEnd; ++Idx) {
        Register Reg = cast<RegisterSDNode>(Op.getOperand(Idx))->getReg();
        if (Reg == PPC::LR || Reg == PPC::LR8) {
          FuncInfo.setLRStoreRequired();
          return Op;
        }
      }
      break;
    }
  }

  return Op;
}
3970
3971
/// Custom lowering for INIT_TRAMPOLINE: emit a call to
/// __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) and return the second
/// element of the pair produced by LowerCallTo. The operation is rejected
/// with a fatal error on AIX.
SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");

  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value

  const EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

  // TrampSize == (isPPC64 ? 48 : 40);
  SDValue TrampSize = DAG.getConstant(isPPC64 ? 48 : 40, dl,
                                      isPPC64 ? MVT::i64 : MVT::i32);

  // Every argument is passed with the pointer-sized integer type.
  const SDValue CallOperands[] = {Trmp, TrampSize, FPtr, Nest};
  TargetLowering::ArgListTy Args;
  for (const SDValue &Operand : CallOperands) {
    TargetLowering::ArgListEntry Entry;
    Entry.Ty = IntPtrTy;
    Entry.Node = Operand;
    Args.push_back(Entry);
  }

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.second;
}
4009
4010
/// Custom lowering for VASTART.
///
/// On PPC64 and AIX the address of the VarArgsFrameIndex slot is stored into
/// the va_list memory. Otherwise (32-bit SVR4) the four fields of the
/// va_list structure are initialized with a chain of stores.
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  const EVT PtrVT = getPointerTy(MF.getDataLayout());
  SDLoc dl(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue VAListPtr = Op.getOperand(1);

  if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue VarArgsFI =
        DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Chain, dl, VarArgsFI, VAListPtr,
                        MachinePointerInfo(SV));
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //               /* where r3:r10 and f1:f8 (if saved)
  //                * are stored
  //                */
  // } va_list[1];

  // Values to store into the four fields.
  SDValue NumGPRs = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
  SDValue NumFPRs = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
  SDValue OverflowFI =
      DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), PtrVT);
  SDValue RegSaveFI =
      DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

  // Distances between consecutive fields: gpr -> fpr is one byte, fpr ->
  // overflow_arg_area is (pointer size - 1) bytes, and overflow_arg_area ->
  // reg_save_area is one pointer.
  const uint64_t PtrBytes = PtrVT.getSizeInBits() / 8;
  const uint64_t FrameOffset = PtrBytes;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);

  const uint64_t StackOffset = PtrBytes - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);

  const uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue GprStore = DAG.getTruncStore(Chain, dl, NumGPRs, VAListPtr,
                                       MachinePointerInfo(SV), MVT::i8);
  uint64_t FieldOffset = FPROffset;
  SDValue FieldPtr =
      DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr, ConstFPROffset);

  // Store second byte : number of float regs
  SDValue FprStore =
      DAG.getTruncStore(GprStore, dl, NumFPRs, FieldPtr,
                        MachinePointerInfo(SV, FieldOffset), MVT::i8);
  FieldOffset += StackOffset;
  FieldPtr = DAG.getNode(ISD::ADD, dl, PtrVT, FieldPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue OverflowStore =
      DAG.getStore(FprStore, dl, OverflowFI, FieldPtr,
                   MachinePointerInfo(SV, FieldOffset));
  FieldOffset += FrameOffset;
  FieldPtr = DAG.getNode(ISD::ADD, dl, PtrVT, FieldPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(OverflowStore, dl, RegSaveFI, FieldPtr,
                      MachinePointerInfo(SV, FieldOffset));
}
4093
4094
/// FPR - The set of FP registers that should be allocated for arguments
4095
/// on Darwin and AIX.
4096
static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,
4097
PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,
4098
PPC::F11, PPC::F12, PPC::F13};
4099
4100
/// CalculateStackSlotSize - Calculates the size reserved for this argument on
4101
/// the stack.
4102
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
4103
unsigned PtrByteSize) {
4104
unsigned ArgSize = ArgVT.getStoreSize();
4105
if (Flags.isByVal())
4106
ArgSize = Flags.getByValSize();
4107
4108
// Round up to multiples of the pointer size, except for array members,
4109
// which are always packed.
4110
if (!Flags.isInConsecutiveRegs())
4111
ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4112
4113
return ArgSize;
4114
}
4115
4116
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
4117
/// on the stack.
4118
static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
4119
ISD::ArgFlagsTy Flags,
4120
unsigned PtrByteSize) {
4121
Align Alignment(PtrByteSize);
4122
4123
// Altivec parameters are padded to a 16 byte boundary.
4124
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
4125
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
4126
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
4127
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
4128
Alignment = Align(16);
4129
4130
// ByVal parameters are aligned as requested.
4131
if (Flags.isByVal()) {
4132
auto BVAlign = Flags.getNonZeroByValAlign();
4133
if (BVAlign > PtrByteSize) {
4134
if (BVAlign.value() % PtrByteSize != 0)
4135
llvm_unreachable(
4136
"ByVal alignment is not a multiple of the pointer size");
4137
4138
Alignment = BVAlign;
4139
}
4140
}
4141
4142
// Array members are always packed to their original alignment.
4143
if (Flags.isInConsecutiveRegs()) {
4144
// If the array member was split into multiple registers, the first
4145
// needs to be aligned to the size of the full type. (Except for
4146
// ppcf128, which is only aligned as its f64 components.)
4147
if (Flags.isSplit() && OrigVT != MVT::ppcf128)
4148
Alignment = Align(OrigVT.getStoreSize());
4149
else
4150
Alignment = Align(ArgVT.getStoreSize());
4151
}
4152
4153
return Alignment;
4154
}
4155
4156
/// CalculateStackSlotUsed - Return whether this argument will use its
4157
/// stack slot (instead of being passed in registers). ArgOffset,
4158
/// AvailableFPRs, and AvailableVRs must hold the current argument
4159
/// position, and will be updated to account for this argument.
4160
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
4161
unsigned PtrByteSize, unsigned LinkageSize,
4162
unsigned ParamAreaSize, unsigned &ArgOffset,
4163
unsigned &AvailableFPRs,
4164
unsigned &AvailableVRs) {
4165
bool UseMemory = false;
4166
4167
// Respect alignment of argument on the stack.
4168
Align Alignment =
4169
CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
4170
ArgOffset = alignTo(ArgOffset, Alignment);
4171
// If there's no space left in the argument save area, we must
4172
// use memory (this check also catches zero-sized arguments).
4173
if (ArgOffset >= LinkageSize + ParamAreaSize)
4174
UseMemory = true;
4175
4176
// Allocate argument on the stack.
4177
ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
4178
if (Flags.isInConsecutiveRegsLast())
4179
ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
4180
// If we overran the argument save area, we must use memory
4181
// (this check catches arguments passed partially in memory)
4182
if (ArgOffset > LinkageSize + ParamAreaSize)
4183
UseMemory = true;
4184
4185
// However, if the argument is actually passed in an FPR or a VR,
4186
// we don't use memory after all.
4187
if (!Flags.isByVal()) {
4188
if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
4189
if (AvailableFPRs > 0) {
4190
--AvailableFPRs;
4191
return false;
4192
}
4193
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
4194
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
4195
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
4196
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
4197
if (AvailableVRs > 0) {
4198
--AvailableVRs;
4199
return false;
4200
}
4201
}
4202
4203
return UseMemory;
4204
}
4205
4206
/// EnsureStackAlignment - Round stack frame size up from NumBytes to
4207
/// ensure minimum alignment required for target.
4208
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
4209
unsigned NumBytes) {
4210
return alignTo(NumBytes, Lowering->getStackAlign());
4211
}
4212
4213
/// Lower the incoming formal arguments by dispatching to the ABI-specific
/// implementation: AIX, 64-bit ELF (SVR4), or 32-bit ELF (SVR4).
SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isAIXABI())
    return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                    InVals);

  if (!Subtarget.is64BitELFABI()) {
    // Neither AIX nor 64-bit ELF: only 32-bit ELF remains.
    assert(Subtarget.is32BitELFABI());
    return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);
  }

  return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                     InVals);
}
4227
4228
/// Lower incoming formal arguments for the 32-bit SVR4 ABI.
///
/// Each argument location assigned by CC_PPC32_SVR4 is turned into either a
/// copy from a live-in register or a load from a fixed stack object, and the
/// result is appended to \p InVals. The minimum reserved area is recorded in
/// the function info. For variadic functions the GPR/FPR argument registers
/// are additionally stored to a stack object for use by va_start. Returns
/// the (possibly updated) chain.
SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  const EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots, so
  // the slots are mutable when guaranteed tail calls are on for fastcc.
  const bool isImmutable =
      !getTargetMachine().Options.GuaranteedTailCallOpt ||
      CallConv != CallingConv::Fast;
  const Align PtrAlign(4);

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, PtrAlign);
  if (useSoftFloat())
    CCInfo.PreAnalyzeFormalArguments(Ins);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
  CCInfo.clearWasPPCF128();

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Argument stored in memory.
    if (!VA.isRegLoc()) {
      assert(VA.isMemLoc());

      // Get the extended size of the argument type in stack
      const unsigned ArgSize = VA.getLocVT().getStoreSize();
      // Get the actual size of the argument type
      const unsigned ObjSize = VA.getValVT().getStoreSize();
      // Stack objects in PPC32 are right justified.
      unsigned ArgOffset = VA.getLocMemOffset();
      ArgOffset += ArgSize - ObjSize;
      const int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(
          DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
      continue;
    }

    // Argument stored in a register: pick the register class for its type.
    const EVT ValVT = VA.getValVT();
    const TargetRegisterClass *RC;

    switch (ValVT.getSimpleVT().SimpleTy) {
    default:
      llvm_unreachable("ValVT not supported by formal arguments Lowering");
    case MVT::i1:
    case MVT::i32:
      RC = &PPC::GPRCRegClass;
      break;
    case MVT::f32:
      if (Subtarget.hasP8Vector())
        RC = &PPC::VSSRCRegClass;
      else if (Subtarget.hasSPE())
        RC = &PPC::GPRCRegClass;
      else
        RC = &PPC::F4RCRegClass;
      break;
    case MVT::f64:
      if (Subtarget.hasVSX())
        RC = &PPC::VSFRCRegClass;
      else if (Subtarget.hasSPE())
        // SPE passes doubles in GPR pairs.
        RC = &PPC::GPRCRegClass;
      else
        RC = &PPC::F8RCRegClass;
      break;
    case MVT::v16i8:
    case MVT::v8i16:
    case MVT::v4i32:
    case MVT::v4f32:
    case MVT::v2f64:
    case MVT::v2i64:
      RC = &PPC::VRRCRegClass;
      break;
    }

    // Transform the arguments stored in physical registers into
    // virtual ones.
    SDValue ArgValue;
    if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
      // The double occupies this location and the next one; consume both.
      assert(i + 1 < e && "No second half of double precision argument");
      Register RegLo = MF.addLiveIn(VA.getLocReg(), RC);
      Register RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
      SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
      SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
      if (!Subtarget.isLittleEndian())
        std::swap(ArgValueLo, ArgValueHi);
      ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
                             ArgValueHi);
    } else {
      Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
      // An i1 is copied out as i32 and then truncated.
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
                                    ValVT == MVT::i1 ? MVT::i32 : ValVT);
      if (ValVT == MVT::i1)
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
    }

    InVals.push_back(ArgValue);
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea =
      std::max<unsigned>(CCByValInfo.getStackSize(), LinkageSize);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    static const MCPhysReg GPArgRegs[] = {
        PPC::R3, PPC::R4, PPC::R5, PPC::R6,
        PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = std::size(GPArgRegs);

    static const MCPhysReg FPArgRegs[] = {
        PPC::F1, PPC::F2, PPC::F3, PPC::F4,
        PPC::F5, PPC::F6, PPC::F7, PPC::F8,
    };
    unsigned NumFPArgRegs = std::size(FPArgRegs);

    // No FP argument registers are saved for soft-float or SPE.
    if (useSoftFloat() || hasSPE())
      NumFPArgRegs = 0;

    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

    FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(
        PtrVT.getSizeInBits() / 8, CCInfo.getStackSize(), true));

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateStackObject(Depth, Align(8), false));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // Store Count registers from Regs to consecutive SlotBytes-sized slots
    // starting at FIN, advancing FIN past each slot and collecting the
    // stores in MemOps.
    auto SpillArgRegs = [&](const MCPhysReg *Regs, unsigned Count,
                            const TargetRegisterClass *RC, EVT RegVT,
                            uint64_t SlotBytes) {
      for (unsigned Idx = 0; Idx != Count; ++Idx) {
        // Get an existing live-in vreg, or add a new one.
        Register VReg = MF.getRegInfo().getLiveInVirtReg(Regs[Idx]);
        if (!VReg)
          VReg = MF.addLiveIn(Regs[Idx], RC);

        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
        SDValue Store =
            DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
        MemOps.push_back(Store);
        // Increment the address for the next argument to store
        SDValue PtrOff = DAG.getConstant(SlotBytes, dl, PtrVT);
        FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
      }
    };

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    SpillArgRegs(GPArgRegs, NumGPArgRegs, &PPC::GPRCRegClass, PtrVT,
                 PtrVT.getSizeInBits()/8);

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
    // is set.
    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    SpillArgRegs(FPArgRegs, NumFPArgRegs, &PPC::F8RCRegClass, MVT::f64,
                 MVT(MVT::f64).getSizeInBits()/8);
  }

  // Tie all register-save stores together into the returned chain.
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}
4476
4477
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
4478
// value to MVT::i64 and then truncate to the correct register size.
4479
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
4480
EVT ObjectVT, SelectionDAG &DAG,
4481
SDValue ArgVal,
4482
const SDLoc &dl) const {
4483
if (Flags.isSExt())
4484
ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
4485
DAG.getValueType(ObjectVT));
4486
else if (Flags.isZExt())
4487
ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
4488
DAG.getValueType(ObjectVT));
4489
4490
return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
4491
}
4492
4493
/// Lower the incoming formal arguments of the current function for the 64-bit
/// SVR4 (ELFv1 / ELFv2) calling convention.
///
/// The function works in three steps:
///  1. A first pass over \p Ins decides whether the caller is guaranteed to
///     have allocated a parameter save area (HasParameterArea).
///  2. A second pass produces one SDValue per entry of \p Ins (pushed onto
///     \p InVals, in order), either by copying from an argument register or
///     by loading from a fixed stack object.  By-value aggregates are
///     represented by the address of a stack object; any pieces that arrive
///     in GPRs are stored into that object.
///  3. The minimum reserved area is recorded in the PPCFunctionInfo and, for
///     vararg functions that use va_start, the remaining argument GPRs are
///     stored to their stack slots.
///
/// \returns the incoming \p Chain, or a TokenFactor over all stores created
/// here when there are any.
SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 8;
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned Num_GPR_Regs = std::size(GPR);
  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
  const unsigned Num_VR_Regs = std::size(VR);

  // Do a first pass over the arguments to determine whether the ABI
  // guarantees that our caller has allocated the parameter save area
  // on its stack frame.  In the ELFv1 ABI, this is always the case;
  // in the ELFv2 ABI, it is true if this is a vararg function or if
  // any parameter is located in a stack slot.

  bool HasParameterArea = !isELFv2ABI || isVarArg;
  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = Num_FPR_Regs;
  unsigned AvailableVRs = Num_VR_Regs;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    // The 'nest' argument does not take part in this calculation.
    if (Ins[i].Flags.isNest())
      continue;

    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
                               PtrByteSize, LinkageSize, ParamAreaSize,
                               NumBytes, AvailableFPRs, AvailableVRs))
      HasParameterArea = true;
  }

  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  SmallVector<SDValue, 8> MemOps;
  Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    EVT OrigVT = Ins[ArgNo].ArgVT;
    unsigned ObjSize = ObjectVT.getStoreSize();
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
    // Keep FuncArg pointing at the IR argument this InputArg came from.
    if (Ins[ArgNo].isOrigArg()) {
      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
    }
    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    unsigned CurArgOffset;
    Align Alignment;
    auto ComputeArgOffset = [&]() {
      /* Respect alignment of argument on the stack.  */
      Alignment =
          CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = alignTo(ArgOffset, Alignment);
      CurArgOffset = ArgOffset;
    };

    if (CallConv != CallingConv::Fast) {
      ComputeArgOffset();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
    }

    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

      if (CallConv == CallingConv::Fast)
        ComputeArgOffset();

      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Empty aggregate parameters do not take up registers.  Examples:
      //   struct { } a;
      //   union  { } b;
      //   int c[0];
      // etc.  However, we have to provide a place-holder in InVals, so
      // pretend we have an 8-byte item at the current address for that
      // purpose.
      if (!ObjSize) {
        int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(FIN);
        continue;
      }

      // Create a stack object covering all stack doublewords occupied
      // by the argument.  If the argument is (fully or partially) on
      // the stack, or if the argument is fully in registers but the
      // caller has allocated the parameter save anyway, we can refer
      // directly to the caller's stack frame.  Otherwise, create a
      // local copy in our own frame.
      int FI;
      if (HasParameterArea ||
          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
        FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
      else
        FI = MFI.CreateStackObject(ArgSize, Alignment, false);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);

      // Handle aggregates smaller than 8 bytes.
      if (ObjSize < PtrByteSize) {
        // The value of the object is its address, which differs from the
        // address of the enclosing doubleword on big-endian systems.
        SDValue Arg = FIN;
        if (!isLittleEndian) {
          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
        }
        InVals.push_back(Arg);

        if (GPR_idx != Num_GPR_Regs) {
          Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
          FuncInfo->addLiveInAttr(VReg, Flags);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), ObjSize * 8);
          SDValue Store =
              DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
                                MachinePointerInfo(&*FuncArg), ObjType);
          MemOps.push_back(Store);
        }
        // Whether we copied from a register or not, advance the offset
        // into the parameter save area by a full doubleword.
        ArgOffset += PtrByteSize;
        continue;
      }

      // The value of the object is its address, which is the address of
      // its first stack doubleword.
      InVals.push_back(FIN);

      // Store whatever pieces of the object are in registers to memory.
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        if (GPR_idx == Num_GPR_Regs)
          break;

        Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
        SDValue Addr = FIN;
        if (j) {
          SDValue Off = DAG.getConstant(j, dl, PtrVT);
          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
        }
        // The last piece may be narrower than a full doubleword.
        unsigned StoreSizeInBits = std::min(PtrByteSize, (ObjSize - j)) * 8;
        EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), StoreSizeInBits);
        SDValue Store =
            DAG.getTruncStore(Val.getValue(1), dl, Val, Addr,
                              MachinePointerInfo(&*FuncArg, j), ObjType);
        MemOps.push_back(Store);
        ++GPR_idx;
      }
      ArgOffset += ArgSize;
      continue;
    }

    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11 (PPC::X11 here).
        Register VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly.  Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != Num_GPR_Regs) {
        Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // value to MVT::i64 and then truncate to the correct register size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      // Each integer argument occupies one doubleword of the argument area.
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += PtrByteSize;
      break;

    case MVT::f32:
    case MVT::f64:
      // These can be scalar arguments or elements of a float array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // float aggregates.
      if (FPR_idx != Num_FPR_Regs) {
        // FPR is the FP argument register table declared outside this
        // function.
        Register VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx],
                              Subtarget.hasP8Vector()
                                  ? &PPC::VSSRCRegClass
                                  : &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
                                                ? &PPC::VSFRCRegClass
                                                : &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
        // once we support fp <-> gpr moves.

        // This can only ever happen in the presence of f32 array types,
        // since otherwise we never run out of FPRs before running out
        // of GPRs.
        Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::f32) {
          // Pick the 32-bit half of the doubleword that holds this float;
          // which half depends on the offset and on endianness.
          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
                                 DAG.getConstant(32, dl, MVT::i32));
          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
        }

        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
      }

      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array.  Otherwise, each float takes 8 bytes.
      if (CallConv != CallingConv::Fast || needsLoad) {
        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
        ArgOffset += ArgSize;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      // These can be scalar arguments or elements of a vector array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // vector aggregates.
      if (VR_idx != Num_VR_Regs) {
        Register VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++VR_idx;
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();
        needsLoad = true;
      }
      // Each of these 128-bit types takes 16 bytes of the argument area.
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 16;
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      // On big-endian, a value smaller than its slot sits at the high-address
      // end of the slot.
      if (ObjSize < ArgSize && !isLittleEndian)
        CurArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
    }

    InVals.push_back(ArgVal);
  }

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea;
  if (HasParameterArea)
    MinReservedArea =
        std::max(ArgOffset, LinkageSize + Num_GPR_Regs * PtrByteSize);
  else
    MinReservedArea = LinkageSize;

  // Set the size that is at least reserved in caller of this function.  Tail
  // call optimized functions' reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  // On ELFv2ABI spec, it writes:
  // C programs that are intended to be *portable* across different compilers
  // and architectures must use the header file <stdarg.h> to deal with variable
  // argument lists.
  if (isVarArg && MFI.hasVAStart()) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
      MFI.CreateFixedObject(PtrByteSize, Depth, true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
      Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Advance the address by one pointer-sized slot (PtrByteSize bytes) for
      // the next register to store.
      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}
4868
4869
/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
4870
/// adjusted to accommodate the arguments for the tailcall.
4871
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
4872
unsigned ParamSize) {
4873
4874
if (!isTailCall) return 0;
4875
4876
PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
4877
unsigned CallerMinReservedArea = FI->getMinReservedArea();
4878
int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
4879
// Remember only if the new adjustment is bigger.
4880
if (SPDiff < FI->getTailCallSPDelta())
4881
FI->setTailCallSPDelta(SPDiff);
4882
4883
return SPDiff;
4884
}
4885
4886
// Forward declaration; the definition appears further down in this file.
static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);
4887
4888
static bool callsShareTOCBase(const Function *Caller,
4889
const GlobalValue *CalleeGV,
4890
const TargetMachine &TM) {
4891
// It does not make sense to call callsShareTOCBase() with a caller that
4892
// is PC Relative since PC Relative callers do not have a TOC.
4893
#ifndef NDEBUG
4894
const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
4895
assert(!STICaller->isUsingPCRelativeCalls() &&
4896
"PC Relative callers do not have a TOC and cannot share a TOC Base");
4897
#endif
4898
4899
// Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
4900
// don't have enough information to determine if the caller and callee share
4901
// the same TOC base, so we have to pessimistically assume they don't for
4902
// correctness.
4903
if (!CalleeGV)
4904
return false;
4905
4906
// If the callee is preemptable, then the static linker will use a plt-stub
4907
// which saves the toc to the stack, and needs a nop after the call
4908
// instruction to convert to a toc-restore.
4909
if (!TM.shouldAssumeDSOLocal(CalleeGV))
4910
return false;
4911
4912
// Functions with PC Relative enabled may clobber the TOC in the same DSO.
4913
// We may need a TOC restore in the situation where the caller requires a
4914
// valid TOC but the callee is PC Relative and does not.
4915
const Function *F = dyn_cast<Function>(CalleeGV);
4916
const GlobalAlias *Alias = dyn_cast<GlobalAlias>(CalleeGV);
4917
4918
// If we have an Alias we can try to get the function from there.
4919
if (Alias) {
4920
const GlobalObject *GlobalObj = Alias->getAliaseeObject();
4921
F = dyn_cast<Function>(GlobalObj);
4922
}
4923
4924
// If we still have no valid function pointer we do not have enough
4925
// information to determine if the callee uses PC Relative calls so we must
4926
// assume that it does.
4927
if (!F)
4928
return false;
4929
4930
// If the callee uses PC Relative we cannot guarantee that the callee won't
4931
// clobber the TOC of the caller and so we must assume that the two
4932
// functions do not share a TOC base.
4933
const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);
4934
if (STICallee->isUsingPCRelativeCalls())
4935
return false;
4936
4937
// If the GV is not a strong definition then we need to assume it can be
4938
// replaced by another function at link time. The function that replaces
4939
// it may not share the same TOC as the caller since the callee may be
4940
// replaced by a PC Relative version of the same function.
4941
if (!CalleeGV->isStrongDefinitionForLinker())
4942
return false;
4943
4944
// The medium and large code models are expected to provide a sufficiently
4945
// large TOC to provide all data addressing needs of a module with a
4946
// single TOC.
4947
if (CodeModel::Medium == TM.getCodeModel() ||
4948
CodeModel::Large == TM.getCodeModel())
4949
return true;
4950
4951
// Any explicitly-specified sections and section prefixes must also match.
4952
// Also, if we're using -ffunction-sections, then each function is always in
4953
// a different section (the same is true for COMDAT functions).
4954
if (TM.getFunctionSections() || CalleeGV->hasComdat() ||
4955
Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())
4956
return false;
4957
if (const auto *F = dyn_cast<Function>(CalleeGV)) {
4958
if (F->getSectionPrefix() != Caller->getSectionPrefix())
4959
return false;
4960
}
4961
4962
return true;
4963
}
4964
4965
static bool
4966
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
4967
const SmallVectorImpl<ISD::OutputArg> &Outs) {
4968
assert(Subtarget.is64BitELFABI());
4969
4970
const unsigned PtrByteSize = 8;
4971
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4972
4973
static const MCPhysReg GPR[] = {
4974
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4975
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4976
};
4977
static const MCPhysReg VR[] = {
4978
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4979
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4980
};
4981
4982
const unsigned NumGPRs = std::size(GPR);
4983
const unsigned NumFPRs = 13;
4984
const unsigned NumVRs = std::size(VR);
4985
const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
4986
4987
unsigned NumBytes = LinkageSize;
4988
unsigned AvailableFPRs = NumFPRs;
4989
unsigned AvailableVRs = NumVRs;
4990
4991
for (const ISD::OutputArg& Param : Outs) {
4992
if (Param.Flags.isNest()) continue;
4993
4994
if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize,
4995
LinkageSize, ParamAreaSize, NumBytes,
4996
AvailableFPRs, AvailableVRs))
4997
return true;
4998
}
4999
return false;
5000
}
5001
5002
static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
5003
if (CB.arg_size() != CallerFn->arg_size())
5004
return false;
5005
5006
auto CalleeArgIter = CB.arg_begin();
5007
auto CalleeArgEnd = CB.arg_end();
5008
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
5009
5010
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
5011
const Value* CalleeArg = *CalleeArgIter;
5012
const Value* CallerArg = &(*CallerArgIter);
5013
if (CalleeArg == CallerArg)
5014
continue;
5015
5016
// e.g. @caller([4 x i64] %a, [4 x i64] %b) {
5017
// tail call @callee([4 x i64] undef, [4 x i64] %b)
5018
// }
5019
// 1st argument of callee is undef and has the same type as caller.
5020
if (CalleeArg->getType() == CallerArg->getType() &&
5021
isa<UndefValue>(CalleeArg))
5022
continue;
5023
5024
return false;
5025
}
5026
5027
return true;
5028
}
5029
5030
// Returns true if TCO is possible between the callers and callees
5031
// calling conventions.
5032
static bool
5033
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
5034
CallingConv::ID CalleeCC) {
5035
// Tail calls are possible with fastcc and ccc.
5036
auto isTailCallableCC = [] (CallingConv::ID CC){
5037
return CC == CallingConv::C || CC == CallingConv::Fast;
5038
};
5039
if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
5040
return false;
5041
5042
// We can safely tail call both fastcc and ccc callees from a c calling
5043
// convention caller. If the caller is fastcc, we may have less stack space
5044
// than a non-fastcc caller with the same signature so disable tail-calls in
5045
// that case.
5046
return CallerCC == CallingConv::C || CallerCC == CalleeCC;
5047
}
5048
5049
bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
5050
const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5051
CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
5052
const SmallVectorImpl<ISD::OutputArg> &Outs,
5053
const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
5054
bool isCalleeExternalSymbol) const {
5055
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
5056
5057
if (DisableSCO && !TailCallOpt) return false;
5058
5059
// Variadic argument functions are not supported.
5060
if (isVarArg) return false;
5061
5062
// Check that the calling conventions are compatible for tco.
5063
if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
5064
return false;
5065
5066
// Caller contains any byval parameter is not supported.
5067
if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
5068
return false;
5069
5070
// Callee contains any byval parameter is not supported, too.
5071
// Note: This is a quick work around, because in some cases, e.g.
5072
// caller's stack size > callee's stack size, we are still able to apply
5073
// sibling call optimization. For example, gcc is able to do SCO for caller1
5074
// in the following example, but not for caller2.
5075
// struct test {
5076
// long int a;
5077
// char ary[56];
5078
// } gTest;
5079
// __attribute__((noinline)) int callee(struct test v, struct test *b) {
5080
// b->a = v.a;
5081
// return 0;
5082
// }
5083
// void caller1(struct test a, struct test c, struct test *b) {
5084
// callee(gTest, b); }
5085
// void caller2(struct test *b) { callee(gTest, b); }
5086
if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
5087
return false;
5088
5089
// If callee and caller use different calling conventions, we cannot pass
5090
// parameters on stack since offsets for the parameter area may be different.
5091
if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
5092
return false;
5093
5094
// All variants of 64-bit ELF ABIs without PC-Relative addressing require that
5095
// the caller and callee share the same TOC for TCO/SCO. If the caller and
5096
// callee potentially have different TOC bases then we cannot tail call since
5097
// we need to restore the TOC pointer after the call.
5098
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
5099
// We cannot guarantee this for indirect calls or calls to external functions.
5100
// When PC-Relative addressing is used, the concept of the TOC is no longer
5101
// applicable so this check is not required.
5102
// Check first for indirect calls.
5103
if (!Subtarget.isUsingPCRelativeCalls() &&
5104
!isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
5105
return false;
5106
5107
// Check if we share the TOC base.
5108
if (!Subtarget.isUsingPCRelativeCalls() &&
5109
!callsShareTOCBase(CallerFunc, CalleeGV, getTargetMachine()))
5110
return false;
5111
5112
// TCO allows altering callee ABI, so we don't have to check further.
5113
if (CalleeCC == CallingConv::Fast && TailCallOpt)
5114
return true;
5115
5116
if (DisableSCO) return false;
5117
5118
// If callee use the same argument list that caller is using, then we can
5119
// apply SCO on this case. If it is not, then we need to check if callee needs
5120
// stack for passing arguments.
5121
// PC Relative tail calls may not have a CallBase.
5122
// If there is no CallBase we cannot verify if we have the same argument
5123
// list so assume that we don't have the same argument list.
5124
if (CB && !hasSameArgumentList(CallerFunc, *CB) &&
5125
needStackSlotPassParameters(Subtarget, Outs))
5126
return false;
5127
else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
5128
return false;
5129
5130
return true;
5131
}
5132
5133
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
5134
/// for tail call optimization. Targets which want to do tail call
5135
/// optimization should implement this function.
5136
bool PPCTargetLowering::IsEligibleForTailCallOptimization(
5137
const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5138
CallingConv::ID CallerCC, bool isVarArg,
5139
const SmallVectorImpl<ISD::InputArg> &Ins) const {
5140
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
5141
return false;
5142
5143
// Variable argument functions are not supported.
5144
if (isVarArg)
5145
return false;
5146
5147
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
5148
// Functions containing by val parameters are not supported.
5149
if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
5150
return false;
5151
5152
// Non-PIC/GOT tail calls are supported.
5153
if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
5154
return true;
5155
5156
// At the moment we can only do local tail calls (in same module, hidden
5157
// or protected) if we are generating PIC.
5158
if (CalleeGV)
5159
return CalleeGV->hasHiddenVisibility() ||
5160
CalleeGV->hasProtectedVisibility();
5161
}
5162
5163
return false;
5164
}
5165
5166
/// isCallCompatibleAddress - Return the immediate to use if the specified
5167
/// 32-bit value is representable in the immediate field of a BxA instruction.
5168
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
5169
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
5170
if (!C) return nullptr;
5171
5172
int Addr = C->getZExtValue();
5173
if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
5174
SignExtend32<26>(Addr) != Addr)
5175
return nullptr; // Top 6 bits have to be sext of immediate.
5176
5177
return DAG
5178
.getConstant(
5179
(int)C->getZExtValue() >> 2, SDLoc(Op),
5180
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
5181
.getNode();
5182
}
5183
5184
namespace {
5185
5186
struct TailCallArgumentInfo {
5187
SDValue Arg;
5188
SDValue FrameIdxOp;
5189
int FrameIdx = 0;
5190
5191
TailCallArgumentInfo() = default;
5192
};
5193
5194
} // end anonymous namespace
5195
5196
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
5197
static void StoreTailCallArgumentsToStackSlot(
5198
SelectionDAG &DAG, SDValue Chain,
5199
const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
5200
SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
5201
for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
5202
SDValue Arg = TailCallArgs[i].Arg;
5203
SDValue FIN = TailCallArgs[i].FrameIdxOp;
5204
int FI = TailCallArgs[i].FrameIdx;
5205
// Store relative to framepointer.
5206
MemOpChains.push_back(DAG.getStore(
5207
Chain, dl, Arg, FIN,
5208
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
5209
}
5210
}
5211
5212
/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
5213
/// the appropriate stack slot for the tail call optimized function call.
5214
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
5215
SDValue OldRetAddr, SDValue OldFP,
5216
int SPDiff, const SDLoc &dl) {
5217
if (SPDiff) {
5218
// Calculate the new stack slot for the return address.
5219
MachineFunction &MF = DAG.getMachineFunction();
5220
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
5221
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
5222
bool isPPC64 = Subtarget.isPPC64();
5223
int SlotSize = isPPC64 ? 8 : 4;
5224
int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
5225
int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
5226
NewRetAddrLoc, true);
5227
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
5228
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
5229
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
5230
MachinePointerInfo::getFixedStack(MF, NewRetAddr));
5231
}
5232
return Chain;
5233
}
5234
5235
/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
5236
/// the position of the argument.
5237
static void
5238
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
5239
SDValue Arg, int SPDiff, unsigned ArgOffset,
5240
SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
5241
int Offset = ArgOffset + SPDiff;
5242
uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
5243
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
5244
EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
5245
SDValue FIN = DAG.getFrameIndex(FI, VT);
5246
TailCallArgumentInfo Info;
5247
Info.Arg = Arg;
5248
Info.FrameIdxOp = FIN;
5249
Info.FrameIdx = FI;
5250
TailCallArguments.push_back(Info);
5251
}
5252
5253
/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
5254
/// stack slot. Returns the chain as result and the loaded frame pointers in
5255
/// LROpOut/FPOpout. Used when tail calling.
5256
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
5257
SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
5258
SDValue &FPOpOut, const SDLoc &dl) const {
5259
if (SPDiff) {
5260
// Load the LR and FP stack slot for later adjusting.
5261
EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
5262
LROpOut = getReturnAddrFrameIndex(DAG);
5263
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
5264
Chain = SDValue(LROpOut.getNode(), 1);
5265
}
5266
return Chain;
5267
}
5268
5269
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
5270
/// by "Src" to address "Dst" of size "Size". Alignment information is
5271
/// specified by the specific parameter attribute. The copy will be passed as
5272
/// a byval function parameter.
5273
/// Sometimes what we are copying is the end of a larger object, the part that
5274
/// does not fit in registers.
5275
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
5276
SDValue Chain, ISD::ArgFlagsTy Flags,
5277
SelectionDAG &DAG, const SDLoc &dl) {
5278
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
5279
return DAG.getMemcpy(
5280
Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), false, false,
5281
/*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());
5282
}
5283
5284
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
5285
/// tail calls.
5286
static void LowerMemOpCallTo(
5287
SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
5288
SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
5289
bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
5290
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
5291
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
5292
if (!isTailCall) {
5293
if (isVector) {
5294
SDValue StackPtr;
5295
if (isPPC64)
5296
StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
5297
else
5298
StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
5299
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
5300
DAG.getConstant(ArgOffset, dl, PtrVT));
5301
}
5302
MemOpChains.push_back(
5303
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
5304
// Calculate and remember argument location.
5305
} else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
5306
TailCallArguments);
5307
}
5308
5309
static void
5310
PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,
5311
const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
5312
SDValue FPOp,
5313
SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
5314
// Emit a sequence of copyto/copyfrom virtual registers for arguments that
5315
// might overwrite each other in case of tail call optimization.
5316
SmallVector<SDValue, 8> MemOpChains2;
5317
// Do not flag preceding copytoreg stuff together with the following stuff.
5318
InGlue = SDValue();
5319
StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
5320
MemOpChains2, dl);
5321
if (!MemOpChains2.empty())
5322
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
5323
5324
// Store the return address to the appropriate stack slot.
5325
Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
5326
5327
// Emit callseq_end just before tailcall node.
5328
Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, dl);
5329
InGlue = Chain.getValue(1);
5330
}
5331
5332
// Is this global address that of a function that can be called by name? (as
5333
// opposed to something that must hold a descriptor for an indirect call).
5334
static bool isFunctionGlobalAddress(const GlobalValue *GV) {
5335
if (GV) {
5336
if (GV->isThreadLocal())
5337
return false;
5338
5339
return GV->getValueType()->isFunctionTy();
5340
}
5341
5342
return false;
5343
}
5344
5345
/// Copy the values returned by a call out of their physical registers.
///
/// Return locations are assigned with RetCC_PPC_Cold for cold-convention
/// calls on SVR4 targets and with RetCC_PPC otherwise. Every location is read
/// with a CopyFromReg threaded through \p Chain and \p InGlue, converted back
/// to its value type when the location was extended, and appended to
/// \p InVals.
///
/// \returns the chain produced by the last register copy.
SDValue PPCTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext());

  CCRetInfo.AnalyzeCallResult(
      Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
               ? RetCC_PPC_Cold
               : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    // The current location is tracked through a pointer. Assigning through a
    // reference here would copy the next location over RVLocs[i] instead of
    // advancing to it.
    CCValAssign *VA = &RVLocs[i];
    assert(VA->isRegLoc() && "Can only return in registers!");

    SDValue Val;

    if (Subtarget.hasSPE() && VA->getLocVT() == MVT::f64) {
      // With SPE an f64 occupies two consecutive locations, each read as an
      // i32 and recombined with BUILD_SPE64.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA->getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Lo.getValue(1);
      InGlue = Lo.getValue(2);
      assert(i + 1 != e && "Missing second location for an SPE f64 result!");
      VA = &RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA->getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Hi.getValue(1);
      InGlue = Hi.getValue(2);
      if (!Subtarget.isLittleEndian())
        std::swap(Lo, Hi);
      Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
    } else {
      Val = DAG.getCopyFromReg(Chain, dl, VA->getLocReg(), VA->getLocVT(),
                               InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
    }

    // Undo any extension applied by the calling convention. For zero/sign
    // extension the known high bits are recorded with an Assert node before
    // truncating.
    switch (VA->getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA->getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA->getLocVT(), Val,
                        DAG.getValueType(VA->getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA->getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA->getLocVT(), Val,
                        DAG.getValueType(VA->getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA->getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
5408
5409
static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
5410
const PPCSubtarget &Subtarget, bool isPatchPoint) {
5411
auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
5412
const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5413
5414
// PatchPoint calls are not indirect.
5415
if (isPatchPoint)
5416
return false;
5417
5418
if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Callee))
5419
return false;
5420
5421
// Darwin, and 32-bit ELF can use a BLA. The descriptor based ABIs can not
5422
// becuase the immediate function pointer points to a descriptor instead of
5423
// a function entry point. The ELFv2 ABI cannot use a BLA because the function
5424
// pointer immediate points to the global entry point, while the BLA would
5425
// need to jump to the local entry point (see rL211174).
5426
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
5427
isBLACompatibleAddress(Callee, DAG))
5428
return false;
5429
5430
return true;
5431
}
5432
5433
// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
5434
static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
5435
return Subtarget.isAIXABI() ||
5436
(Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
5437
}
5438
5439
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
5440
const Function &Caller, const SDValue &Callee,
5441
const PPCSubtarget &Subtarget,
5442
const TargetMachine &TM,
5443
bool IsStrictFPCall = false) {
5444
if (CFlags.IsTailCall)
5445
return PPCISD::TC_RETURN;
5446
5447
unsigned RetOpc = 0;
5448
// This is a call through a function pointer.
5449
if (CFlags.IsIndirect) {
5450
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
5451
// indirect calls. The save of the caller's TOC pointer to the stack will be
5452
// inserted into the DAG as part of call lowering. The restore of the TOC
5453
// pointer is modeled by using a pseudo instruction for the call opcode that
5454
// represents the 2 instruction sequence of an indirect branch and link,
5455
// immediately followed by a load of the TOC pointer from the stack save
5456
// slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
5457
// as it is not saved or used.
5458
RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
5459
: PPCISD::BCTRL;
5460
} else if (Subtarget.isUsingPCRelativeCalls()) {
5461
assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
5462
RetOpc = PPCISD::CALL_NOTOC;
5463
} else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
5464
// The ABIs that maintain a TOC pointer accross calls need to have a nop
5465
// immediately following the call instruction if the caller and callee may
5466
// have different TOC bases. At link time if the linker determines the calls
5467
// may not share a TOC base, the call is redirected to a trampoline inserted
5468
// by the linker. The trampoline will (among other things) save the callers
5469
// TOC pointer at an ABI designated offset in the linkage area and the
5470
// linker will rewrite the nop to be a load of the TOC pointer from the
5471
// linkage area into gpr2.
5472
auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
5473
const GlobalValue *GV = G ? G->getGlobal() : nullptr;
5474
RetOpc =
5475
callsShareTOCBase(&Caller, GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
5476
} else
5477
RetOpc = PPCISD::CALL;
5478
if (IsStrictFPCall) {
5479
switch (RetOpc) {
5480
default:
5481
llvm_unreachable("Unknown call opcode");
5482
case PPCISD::BCTRL_LOAD_TOC:
5483
RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
5484
break;
5485
case PPCISD::BCTRL:
5486
RetOpc = PPCISD::BCTRL_RM;
5487
break;
5488
case PPCISD::CALL_NOTOC:
5489
RetOpc = PPCISD::CALL_NOTOC_RM;
5490
break;
5491
case PPCISD::CALL:
5492
RetOpc = PPCISD::CALL_RM;
5493
break;
5494
case PPCISD::CALL_NOP:
5495
RetOpc = PPCISD::CALL_NOP_RM;
5496
break;
5497
}
5498
}
5499
return RetOpc;
5500
}
5501
5502
/// Rewrite the callee of a direct call into the node the call instruction
/// will reference:
///  - a BLA-compatible absolute address, on ABIs that allow it;
///  - for a function's global address, a target global address (or, on AIX,
///    the function's entry point symbol);
///  - for an external symbol, a target external symbol (on AIX, a
///    user-declared function of the same name is preferred, otherwise the
///    "."-prefixed entry point symbol is used).
/// Any other callee is returned unchanged.
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                               const SDLoc &dl, const PPCSubtarget &Subtarget) {
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
      return SDValue(Dest, 0);

  // The global named by the callee, or null when the callee is not a
  // GlobalAddressSDNode. Computed once and shared by everything below.
  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = G ? G->getGlobal() : nullptr;

  // Returns true if the callee is local, and false otherwise.
  auto isLocalCallee = [&]() {
    return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
           !isa_and_nonnull<GlobalIFunc>(GV);
  };

  // The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
  // a static relocation model causes some versions of GNU LD (2.17.50, at
  // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
  // built with secure-PLT.
  bool UsePlt =
      Subtarget.is32BitELFABI() && !isLocalCallee() &&
      Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;

  // Builds the MCSymbol node for the AIX entry point of the function Func.
  const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *Func) {
    const TargetMachine &TM = Subtarget.getTargetMachine();
    const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
    MCSymbolXCOFF *S =
        cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(Func, TM));

    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
    return DAG.getMCSymbol(S, PtrVT);
  };

  if (isFunctionGlobalAddress(GV)) {
    if (Subtarget.isAIXABI()) {
      assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
      return getAIXFuncEntryPointSymbolSDNode(GV);
    }
    return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
                                      UsePlt ? PPCII::MO_PLT : 0);
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *SymName = S->getSymbol();
    if (Subtarget.isAIXABI()) {
      // If there exists a user-declared function whose name is the same as the
      // ExternalSymbol's, then we pick up the user-declared version.
      const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
      if (const Function *F =
              dyn_cast_or_null<Function>(Mod->getNamedValue(SymName)))
        return getAIXFuncEntryPointSymbolSDNode(F);

      // On AIX, direct function calls reference the symbol for the function's
      // entry point, which is named by prepending a "." before the function's
      // C-linkage name. A Qualname is returned here because an external
      // function entry point is a csect with XTY_ER property.
      const auto getExternalFunctionEntryPointSymbol = [&](StringRef Name) {
        auto &Context = DAG.getMachineFunction().getContext();
        MCSectionXCOFF *Sec = Context.getXCOFFSection(
            (Twine(".") + Twine(Name)).str(), SectionKind::getMetadata(),
            XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
        return Sec->getQualNameSymbol();
      };

      SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
    }
    return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
                                       UsePlt ? PPCII::MO_PLT : 0);
  }

  // No transformation needed.
  assert(Callee.getNode() && "What no callee?");
  return Callee;
}
5580
5581
/// Return the output chain of a CALLSEQ_START node.
static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
  assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
         "Expected a CALLSEQ_STARTSDNode.");

  // The chain is the node's last result, unless the node also produces glue.
  // In that case the glue is last and the chain is the result just before it.
  const unsigned NumResults = CallSeqStart->getNumValues();
  SDValue Last = CallSeqStart.getValue(NumResults - 1);
  if (Last.getValueType() == MVT::Glue)
    return CallSeqStart.getValue(NumResults - 2);

  return Last;
}
5594
5595
// Creates the node that moves a functions address into the count register
5596
// to prepare for an indirect call instruction.
5597
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5598
SDValue &Glue, SDValue &Chain,
5599
const SDLoc &dl) {
5600
SDValue MTCTROps[] = {Chain, Callee, Glue};
5601
EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
5602
Chain = DAG.getNode(PPCISD::MTCTR, dl, ReturnTypes,
5603
ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
5604
// The glue is the second value produced.
5605
Glue = Chain.getValue(1);
5606
}
5607
5608
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
5609
SDValue &Glue, SDValue &Chain,
5610
SDValue CallSeqStart,
5611
const CallBase *CB, const SDLoc &dl,
5612
bool hasNest,
5613
const PPCSubtarget &Subtarget) {
5614
// Function pointers in the 64-bit SVR4 ABI do not point to the function
5615
// entry point, but to the function descriptor (the function entry point
5616
// address is part of the function descriptor though).
5617
// The function descriptor is a three doubleword structure with the
5618
// following fields: function entry point, TOC base address and
5619
// environment pointer.
5620
// Thus for a call through a function pointer, the following actions need
5621
// to be performed:
5622
// 1. Save the TOC of the caller in the TOC save area of its stack
5623
// frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
5624
// 2. Load the address of the function entry point from the function
5625
// descriptor.
5626
// 3. Load the TOC of the callee from the function descriptor into r2.
5627
// 4. Load the environment pointer from the function descriptor into
5628
// r11.
5629
// 5. Branch to the function entry point address.
5630
// 6. On return of the callee, the TOC of the caller needs to be
5631
// restored (this is done in FinishCall()).
5632
//
5633
// The loads are scheduled at the beginning of the call sequence, and the
5634
// register copies are flagged together to ensure that no other
5635
// operations can be scheduled in between. E.g. without flagging the
5636
// copies together, a TOC access in the caller could be scheduled between
5637
// the assignment of the callee TOC and the branch to the callee, which leads
5638
// to incorrect code.
5639
5640
// Start by loading the function address from the descriptor.
5641
SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
5642
auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
5643
? (MachineMemOperand::MODereferenceable |
5644
MachineMemOperand::MOInvariant)
5645
: MachineMemOperand::MONone;
5646
5647
MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
5648
5649
// Registers used in building the DAG.
5650
const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
5651
const MCRegister TOCReg = Subtarget.getTOCPointerRegister();
5652
5653
// Offsets of descriptor members.
5654
const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
5655
const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
5656
5657
const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
5658
const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
5659
5660
// One load for the functions entry point address.
5661
SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
5662
Alignment, MMOFlags);
5663
5664
// One for loading the TOC anchor for the module that contains the called
5665
// function.
5666
SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);
5667
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);
5668
SDValue TOCPtr =
5669
DAG.getLoad(RegVT, dl, LDChain, AddTOC,
5670
MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);
5671
5672
// One for loading the environment pointer.
5673
SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);
5674
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);
5675
SDValue LoadEnvPtr =
5676
DAG.getLoad(RegVT, dl, LDChain, AddPtr,
5677
MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);
5678
5679
5680
// Then copy the newly loaded TOC anchor to the TOC pointer.
5681
SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);
5682
Chain = TOCVal.getValue(0);
5683
Glue = TOCVal.getValue(1);
5684
5685
// If the function call has an explicit 'nest' parameter, it takes the
5686
// place of the environment pointer.
5687
assert((!hasNest || !Subtarget.isAIXABI()) &&
5688
"Nest parameter is not supported on AIX.");
5689
if (!hasNest) {
5690
SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);
5691
Chain = EnvVal.getValue(0);
5692
Glue = EnvVal.getValue(1);
5693
}
5694
5695
// The rest of the indirect call sequence is the same as the non-descriptor
5696
// DAG.
5697
prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);
5698
}
5699
5700
static void
5701
buildCallOperands(SmallVectorImpl<SDValue> &Ops,
5702
PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
5703
SelectionDAG &DAG,
5704
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
5705
SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
5706
const PPCSubtarget &Subtarget) {
5707
const bool IsPPC64 = Subtarget.isPPC64();
5708
// MVT for a general purpose register.
5709
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
5710
5711
// First operand is always the chain.
5712
Ops.push_back(Chain);
5713
5714
// If it's a direct call pass the callee as the second operand.
5715
if (!CFlags.IsIndirect)
5716
Ops.push_back(Callee);
5717
else {
5718
assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
5719
5720
// For the TOC based ABIs, we have saved the TOC pointer to the linkage area
5721
// on the stack (this would have been done in `LowerCall_64SVR4` or
5722
// `LowerCall_AIX`). The call instruction is a pseudo instruction that
5723
// represents both the indirect branch and a load that restores the TOC
5724
// pointer from the linkage area. The operand for the TOC restore is an add
5725
// of the TOC save offset to the stack pointer. This must be the second
5726
// operand: after the chain input but before any other variadic arguments.
5727
// For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
5728
// saved or used.
5729
if (isTOCSaveRestoreRequired(Subtarget)) {
5730
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
5731
5732
SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
5733
unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
5734
SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
5735
SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);
5736
Ops.push_back(AddTOC);
5737
}
5738
5739
// Add the register used for the environment pointer.
5740
if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
5741
Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
5742
RegVT));
5743
5744
5745
// Add CTR register as callee so a bctr can be emitted later.
5746
if (CFlags.IsTailCall)
5747
Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
5748
}
5749
5750
// If this is a tail call add stack pointer delta.
5751
if (CFlags.IsTailCall)
5752
Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
5753
5754
// Add argument registers to the end of the list so that they are known live
5755
// into the call.
5756
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
5757
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
5758
RegsToPass[i].second.getValueType()));
5759
5760
// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
5761
// no way to mark dependencies as implicit here.
5762
// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
5763
if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
5764
!CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
5765
Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));
5766
5767
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
5768
if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
5769
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
5770
5771
// Add a register mask operand representing the call-preserved registers.
5772
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
5773
const uint32_t *Mask =
5774
TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
5775
assert(Mask && "Missing call preserved mask for calling convention");
5776
Ops.push_back(DAG.getRegisterMask(Mask));
5777
5778
// If the glue is valid, it is the last operand.
5779
if (Glue.getNode())
5780
Ops.push_back(Glue);
5781
}
5782
5783
/// Emit the call node itself once arguments have been set up. Picks the call
/// opcode, prepares the callee (direct, descriptor-based indirect, or plain
/// indirect), and builds the operand list. For a tail call the TC_RETURN node
/// is returned directly; otherwise the call is followed by a callseq_end and
/// the results are copied out through LowerCallResult.
SDValue PPCTargetLowering::FinishCall(
    CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
    SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {

  const bool Is64BitELFWithTOC =
      Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls();
  if (Is64BitELFWithTOC || Subtarget.isAIXABI())
    setUsesTOCBasePtr(DAG);

  // The opcode is chosen from the callee as passed in, before it is
  // transformed below.
  const bool IsStrictFP = CB && CB->isStrictFP();
  unsigned CallOpc =
      getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
                    Subtarget, DAG.getTarget(), IsStrictFP);

  if (CFlags.IsIndirect) {
    if (Subtarget.usesFunctionDescriptors())
      prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
                                    dl, CFlags.HasNest, Subtarget);
    else
      prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
  } else {
    Callee = transformCallee(Callee, DAG, dl, Subtarget);
  }

  // Build the operand list for the call instruction.
  SmallVector<SDValue, 8> Ops;
  buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
                    SPDiff, Subtarget);

  // Emit tail call.
  if (CFlags.IsTailCall) {
    // Indirect tail call when using PC Relative calls do not have the same
    // constraints.
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee) ||
            (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
           "Expecting a global address, external symbol, absolute value, "
           "register or an indirect tail call when PC Relative calls are "
           "used.");
    // PC Relative calls also use TC_RETURN as the way to mark tail calls.
    assert(CallOpc == PPCISD::TC_RETURN &&
           "Unexpected call opcode for a tail call.");
    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    SDValue TailCall = DAG.getNode(CallOpc, dl, MVT::Other, Ops);
    DAG.addNoMergeSiteInfo(TailCall.getNode(), CFlags.NoMerge);
    return TailCall;
  }

  // An ordinary call produces a chain and a glue result.
  EVT CallResultTypes[] = {MVT::Other, MVT::Glue};
  Chain = DAG.getNode(CallOpc, dl, CallResultTypes, Ops);
  DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
  Glue = Chain.getValue(1);

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops = 0;
  if (CFlags.CallConv == CallingConv::Fast &&
      getTargetMachine().Options.GuaranteedTailCallOpt)
    BytesCalleePops = NumBytes;

  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, BytesCalleePops, Glue, dl);
  Glue = Chain.getValue(1);

  return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,
                         DAG, InVals);
}
5852
5853
bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {
5854
CallingConv::ID CalleeCC = CB->getCallingConv();
5855
const Function *CallerFunc = CB->getCaller();
5856
CallingConv::ID CallerCC = CallerFunc->getCallingConv();
5857
const Function *CalleeFunc = CB->getCalledFunction();
5858
if (!CalleeFunc)
5859
return false;
5860
const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(CalleeFunc);
5861
5862
SmallVector<ISD::OutputArg, 2> Outs;
5863
SmallVector<ISD::InputArg, 2> Ins;
5864
5865
GetReturnInfo(CalleeCC, CalleeFunc->getReturnType(),
5866
CalleeFunc->getAttributes(), Outs, *this,
5867
CalleeFunc->getDataLayout());
5868
5869
return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,
5870
CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,
5871
false /*isCalleeExternalSymbol*/);
5872
}
5873
5874
bool PPCTargetLowering::isEligibleForTCO(
5875
const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
5876
CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
5877
const SmallVectorImpl<ISD::OutputArg> &Outs,
5878
const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
5879
bool isCalleeExternalSymbol) const {
5880
if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
5881
return false;
5882
5883
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
5884
return IsEligibleForTailCallOptimization_64SVR4(
5885
CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,
5886
isCalleeExternalSymbol);
5887
else
5888
return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,
5889
isVarArg, Ins);
5890
}
5891
5892
/// Entry point for call lowering. Re-evaluates a requested tail call (writing
/// the outcome back to CLI.IsTailCall), fails hard if a musttail call cannot
/// be tail called, converts named callees to pointers when long calls are in
/// use, and then dispatches to the AIX, 64-bit SVR4 or 32-bit SVR4 lowering.
SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  // Bound by reference so the final decision is visible to the caller.
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;
  bool isPatchPoint = CLI.IsPatchPoint;
  const CallBase *CB = CLI.CB;

  if (isTailCall) {
    MachineFunction &MF = DAG.getMachineFunction();
    const Function &CallerFn = MF.getFunction();
    CallingConv::ID CallerCC = CallerFn.getCallingConv();
    auto *GlobalNode = dyn_cast<GlobalAddressSDNode>(Callee);
    const GlobalValue *CalleeGV =
        GlobalNode ? GlobalNode->getGlobal() : nullptr;
    bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Callee);

    isTailCall = isEligibleForTCO(CalleeGV, CallConv, CallerCC, CB, isVarArg,
                                  Outs, Ins, &CallerFn, IsCalleeExternalSymbol);
    if (isTailCall) {
      ++NumTailCalls;
      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
        ++NumSiblingCalls;

      // PC Relative calls no longer guarantee that the callee is a Global
      // Address Node. The callee could be an indirect tail call in which
      // case the SDValue for the callee could be a load (to load the address
      // of a function pointer) or it may be a register copy (to move the
      // address of the callee from a function parameter into a virtual
      // register). It may also be an ExternalSymbolSDNode (ex memcopy).
      assert((Subtarget.isUsingPCRelativeCalls() ||
              isa<GlobalAddressSDNode>(Callee)) &&
             "Callee should be an llvm::Function object.");

      LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
                        << "\nTCO callee: ");
      LLVM_DEBUG(Callee.dump());
    }
  }

  const bool IsMustTail = CB && CB->isMustTailCall();
  if (!isTailCall && IsMustTail)
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");

  // When long calls (i.e. indirect calls) are always used, calls are always
  // made via function pointer. If we have a function name, first translate it
  // into a pointer.
  if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
      !isTailCall)
    Callee = LowerGlobalAddress(Callee, DAG);

  const bool IsIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint);
  // A 'nest' argument is only honoured on the 64-bit ELF ABIs.
  const bool HasNest =
      Subtarget.is64BitELFABI() &&
      any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); });
  CallFlags CFlags(CallConv, isTailCall, isVarArg, isPatchPoint, IsIndirect,
                   HasNest, CLI.NoMerge);

  if (Subtarget.isAIXABI())
    return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                         InVals, CB);

  assert(Subtarget.isSVR4ABI());
  if (Subtarget.isPPC64())
    return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                            InVals, CB);
  return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                          InVals, CB);
}
5969
5970
SDValue PPCTargetLowering::LowerCall_32SVR4(
5971
SDValue Chain, SDValue Callee, CallFlags CFlags,
5972
const SmallVectorImpl<ISD::OutputArg> &Outs,
5973
const SmallVectorImpl<SDValue> &OutVals,
5974
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
5975
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
5976
const CallBase *CB) const {
5977
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
5978
// of the 32-bit SVR4 ABI stack frame layout.
5979
5980
const CallingConv::ID CallConv = CFlags.CallConv;
5981
const bool IsVarArg = CFlags.IsVarArg;
5982
const bool IsTailCall = CFlags.IsTailCall;
5983
5984
assert((CallConv == CallingConv::C ||
5985
CallConv == CallingConv::Cold ||
5986
CallConv == CallingConv::Fast) && "Unknown calling convention!");
5987
5988
const Align PtrAlign(4);
5989
5990
MachineFunction &MF = DAG.getMachineFunction();
5991
5992
// Mark this function as potentially containing a function that contains a
5993
// tail call. As a consequence the frame pointer will be used for dynamicalloc
5994
// and restoring the callers stack pointer in this functions epilog. This is
5995
// done because by tail calling the called function might overwrite the value
5996
// in this function's (MF) stack pointer stack slot 0(SP).
5997
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
5998
CallConv == CallingConv::Fast)
5999
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
6000
6001
// Count how many bytes are to be pushed on the stack, including the linkage
6002
// area, parameter list area and the part of the local variable space which
6003
// contains copies of aggregates which are passed by value.
6004
6005
// Assign locations to all of the outgoing arguments.
6006
SmallVector<CCValAssign, 16> ArgLocs;
6007
PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
6008
6009
// Reserve space for the linkage area on the stack.
6010
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
6011
PtrAlign);
6012
if (useSoftFloat())
6013
CCInfo.PreAnalyzeCallOperands(Outs);
6014
6015
if (IsVarArg) {
6016
// Handle fixed and variable vector arguments differently.
6017
// Fixed vector arguments go into registers as long as registers are
6018
// available. Variable vector arguments always go into memory.
6019
unsigned NumArgs = Outs.size();
6020
6021
for (unsigned i = 0; i != NumArgs; ++i) {
6022
MVT ArgVT = Outs[i].VT;
6023
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
6024
bool Result;
6025
6026
if (Outs[i].IsFixed) {
6027
Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
6028
CCInfo);
6029
} else {
6030
Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
6031
ArgFlags, CCInfo);
6032
}
6033
6034
if (Result) {
6035
#ifndef NDEBUG
6036
errs() << "Call operand #" << i << " has unhandled type "
6037
<< ArgVT << "\n";
6038
#endif
6039
llvm_unreachable(nullptr);
6040
}
6041
}
6042
} else {
6043
// All arguments are treated the same.
6044
CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
6045
}
6046
CCInfo.clearWasPPCF128();
6047
6048
// Assign locations to all of the outgoing aggregate by value arguments.
6049
SmallVector<CCValAssign, 16> ByValArgLocs;
6050
CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
6051
6052
// Reserve stack space for the allocations in CCInfo.
6053
CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);
6054
6055
CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
6056
6057
// Size of the linkage area, parameter list area and the part of the local
6058
// space variable where copies of aggregates which are passed by value are
6059
// stored.
6060
unsigned NumBytes = CCByValInfo.getStackSize();
6061
6062
// Calculate by how many bytes the stack has to be adjusted in case of tail
6063
// call optimization.
6064
int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);
6065
6066
// Adjust the stack pointer for the new arguments...
6067
// These operations are automatically eliminated by the prolog/epilog pass
6068
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
6069
SDValue CallSeqStart = Chain;
6070
6071
// Load the return address and frame pointer so it can be moved somewhere else
6072
// later.
6073
SDValue LROp, FPOp;
6074
Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
6075
6076
// Set up a copy of the stack pointer for use loading and storing any
6077
// arguments that may not fit in the registers available for argument
6078
// passing.
6079
SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
6080
6081
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6082
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
6083
SmallVector<SDValue, 8> MemOpChains;
6084
6085
bool seenFloatArg = false;
6086
// Walk the register/memloc assignments, inserting copies/loads.
6087
// i - Tracks the index into the list of registers allocated for the call
6088
// RealArgIdx - Tracks the index into the list of actual function arguments
6089
// j - Tracks the index into the list of byval arguments
6090
for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
6091
i != e;
6092
++i, ++RealArgIdx) {
6093
CCValAssign &VA = ArgLocs[i];
6094
SDValue Arg = OutVals[RealArgIdx];
6095
ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
6096
6097
if (Flags.isByVal()) {
6098
// Argument is an aggregate which is passed by value, thus we need to
6099
// create a copy of it in the local variable space of the current stack
6100
// frame (which is the stack frame of the caller) and pass the address of
6101
// this copy to the callee.
6102
assert((j < ByValArgLocs.size()) && "Index out of bounds!");
6103
CCValAssign &ByValVA = ByValArgLocs[j++];
6104
assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
6105
6106
// Memory reserved in the local variable space of the callers stack frame.
6107
unsigned LocMemOffset = ByValVA.getLocMemOffset();
6108
6109
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
6110
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
6111
StackPtr, PtrOff);
6112
6113
// Create a copy of the argument in the local area of the current
6114
// stack frame.
6115
SDValue MemcpyCall =
6116
CreateCopyOfByValArgument(Arg, PtrOff,
6117
CallSeqStart.getNode()->getOperand(0),
6118
Flags, DAG, dl);
6119
6120
// This must go outside the CALLSEQ_START..END.
6121
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
6122
SDLoc(MemcpyCall));
6123
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
6124
NewCallSeqStart.getNode());
6125
Chain = CallSeqStart = NewCallSeqStart;
6126
6127
// Pass the address of the aggregate copy on the stack either in a
6128
// physical register or in the parameter list area of the current stack
6129
// frame to the callee.
6130
Arg = PtrOff;
6131
}
6132
6133
// When useCRBits() is true, there can be i1 arguments.
6134
// It is because getRegisterType(MVT::i1) => MVT::i1,
6135
// and for other integer types getRegisterType() => MVT::i32.
6136
// Extend i1 and ensure callee will get i32.
6137
if (Arg.getValueType() == MVT::i1)
6138
Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
6139
dl, MVT::i32, Arg);
6140
6141
if (VA.isRegLoc()) {
6142
seenFloatArg |= VA.getLocVT().isFloatingPoint();
6143
// Put argument in a physical register.
6144
if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
6145
bool IsLE = Subtarget.isLittleEndian();
6146
SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
6147
DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
6148
RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
6149
SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
6150
DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
6151
RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
6152
SVal.getValue(0)));
6153
} else
6154
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
6155
} else {
6156
// Put argument in the parameter list area of the current stack frame.
6157
assert(VA.isMemLoc());
6158
unsigned LocMemOffset = VA.getLocMemOffset();
6159
6160
if (!IsTailCall) {
6161
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
6162
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
6163
StackPtr, PtrOff);
6164
6165
MemOpChains.push_back(
6166
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
6167
} else {
6168
// Calculate and remember argument location.
6169
CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
6170
TailCallArguments);
6171
}
6172
}
6173
}
6174
6175
if (!MemOpChains.empty())
6176
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
6177
6178
// Build a sequence of copy-to-reg nodes chained together with token chain
6179
// and flag operands which copy the outgoing args into the appropriate regs.
6180
SDValue InGlue;
6181
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
6182
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
6183
RegsToPass[i].second, InGlue);
6184
InGlue = Chain.getValue(1);
6185
}
6186
6187
// Set CR bit 6 to true if this is a vararg call with floating args passed in
6188
// registers.
6189
if (IsVarArg) {
6190
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
6191
SDValue Ops[] = { Chain, InGlue };
6192
6193
Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl,
6194
VTs, ArrayRef(Ops, InGlue.getNode() ? 2 : 1));
6195
6196
InGlue = Chain.getValue(1);
6197
}
6198
6199
if (IsTailCall)
6200
PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6201
TailCallArguments);
6202
6203
return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
6204
Callee, SPDiff, NumBytes, Ins, InVals, CB);
6205
}
6206
6207
// Copy an argument into memory, being careful to do this outside the
6208
// call sequence for the call to which the argument belongs.
6209
SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
6210
SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
6211
SelectionDAG &DAG, const SDLoc &dl) const {
6212
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
6213
CallSeqStart.getNode()->getOperand(0),
6214
Flags, DAG, dl);
6215
// The MEMCPY must go outside the CALLSEQ_START..END.
6216
int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
6217
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
6218
SDLoc(MemcpyCall));
6219
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
6220
NewCallSeqStart.getNode());
6221
return NewCallSeqStart;
6222
}
6223
6224
/// Lower an outgoing call for the 64-bit SVR4 ABIs (ELFv1 and ELFv2).
///
/// The lowering proceeds in the following steps:
///  1. Decide whether a parameter save area is needed and compute the number
///     of stack bytes (linkage area plus parameter area) used by the call.
///  2. Emit CALLSEQ_START (skipped for sibling calls) and the tail-call
///     return-address / frame-pointer loads.
///  3. Walk the outgoing arguments and place each one in GPRs, FPRs, VRs
///     and/or a stack slot. By-value aggregates that need a memory copy are
///     copied outside the call sequence via createMemcpyOutsideCallSeq.
///  4. For indirect calls, store the TOC pointer when
///     isTOCSaveRestoreRequired() says so and, on ELFv2, pass the callee
///     address in X12.
///  5. Glue the register copies together and hand the result to FinishCall.
///
/// \param Chain    Incoming token chain.
/// \param Callee   The call target.
/// \param CFlags   Calling convention and tail/vararg/indirect/patchpoint
///                 flags describing the call.
/// \param Outs     Per-argument type and flag information.
/// \param OutVals  The argument values, parallel to \p Outs.
/// \param Ins      Description of the values returned by the call.
/// \param InVals   Forwarded to FinishCall together with \p Ins.
/// \param CB       Forwarded to FinishCall.
/// \returns whatever FinishCall returns for the lowered call.
SDValue PPCTargetLowering::LowerCall_64SVR4(
    SDValue Chain, SDValue Callee, CallFlags CFlags,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    const CallBase *CB) const {
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned NumOps = Outs.size();
  bool IsSibCall = false;
  bool IsFastCall = CFlags.CallConv == CallingConv::Fast;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  unsigned PtrByteSize = 8;

  MachineFunction &MF = DAG.getMachineFunction();

  if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
    IsSibCall = true;

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamicalloc
  // and restoring the callers stack pointer in this functions epilog. This is
  // done because by tail calling the called function might overwrite the value
  // in this function's (MF) stack pointer stack slot 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  assert(!(IsFastCall && CFlags.IsVarArg) &&
         "fastcc not supported on varargs functions");

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = std::size(GPR);
  const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
  const unsigned NumVRs = std::size(VR);

  // On ELFv2, we can avoid allocating the parameter area if all the arguments
  // can be passed to the callee in registers.
  // For the fast calling convention, there is another check below.
  // Note: We should keep consistent with LowerFormalArguments_64SVR4()
  bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
  if (!HasParameterArea) {
    unsigned ParamAreaSize = NumGPRs * PtrByteSize;
    unsigned AvailableFPRs = NumFPRs;
    unsigned AvailableVRs = NumVRs;
    unsigned NumBytesTmp = NumBytes;
    for (unsigned i = 0; i != NumOps; ++i) {
      if (Outs[i].Flags.isNest()) continue;
      if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
                                 PtrByteSize, LinkageSize, ParamAreaSize,
                                 NumBytesTmp, AvailableFPRs, AvailableVRs))
        HasParameterArea = true;
    }
  }

  // When using the fast calling convention, we don't provide backing for
  // arguments that will be in registers.
  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;

  // Avoid allocating parameter area for fastcc functions if all the arguments
  // can be passed in the registers.
  if (IsFastCall)
    HasParameterArea = false;

  // Add up all the space actually used.
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    EVT OrigVT = Outs[i].ArgVT;

    if (Flags.isNest())
      continue;

    if (IsFastCall) {
      if (Flags.isByVal()) {
        NumGPRsUsed += (Flags.getByValSize()+7)/8;
        if (NumGPRsUsed > NumGPRs)
          HasParameterArea = true;
      } else {
        // An argument that still finds a free register of its class needs no
        // stack space, so skip the size accounting below for it.
        switch (ArgVT.getSimpleVT().SimpleTy) {
        default: llvm_unreachable("Unexpected ValueType for argument!");
        case MVT::i1:
        case MVT::i32:
        case MVT::i64:
          if (++NumGPRsUsed <= NumGPRs)
            continue;
          break;
        case MVT::v4f32:
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v16i8:
        case MVT::v2f64:
        case MVT::v2i64:
        case MVT::v1i128:
        case MVT::f128:
          if (++NumVRsUsed <= NumVRs)
            continue;
          break;
        case MVT::f32:
        case MVT::f64:
          if (++NumFPRsUsed <= NumFPRs)
            continue;
          break;
        }
        HasParameterArea = true;
      }
    }

    /* Respect alignment of argument on the stack.  */
    auto Alignment =
        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
    NumBytes = alignTo(NumBytes, Alignment);

    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
    if (Flags.isInConsecutiveRegsLast())
      NumBytes = alignTo(NumBytes, PtrByteSize);
  }

  unsigned NumBytesActuallyUsed = NumBytes;

  // In the old ELFv1 ABI,
  // the prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if it is varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed.  As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  // In the ELFv2 ABI, we allocate the parameter area iff a callee
  // really requires memory operands, e.g. a vararg function.
  if (HasParameterArea)
    NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
  else
    NumBytes = LinkageSize;

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  int SPDiff = 0;

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  if (!IsSibCall)
    SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (CFlags.IsTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so it can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);

  // Figure out which arguments are going to go in registers, and which in
  // memory.  Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    EVT OrigVT = Outs[i].ArgVT;

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    auto ComputePtrOff = [&]() {
      /* Respect alignment of argument on the stack.  */
      auto Alignment =
          CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = alignTo(ArgOffset, Alignment);

      PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    };

    if (!IsFastCall) {
      ComputePtrOff();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, NumGPRs);
    }

    // Promote integers to 64-bit values.
    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME memcpy is used way more than necessary.  Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      // Note: Size includes alignment padding, so
      //   struct x { short a; char b; }
      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
      // These are the proper values we need for right-justifying the
      // aggregate in a parameter register.
      unsigned Size = Flags.getByValSize();

      // An empty aggregate parameter takes up no storage and no
      // registers.
      if (Size == 0)
        continue;

      if (IsFastCall)
        ComputePtrOff();

      // All aggregates smaller than 8 bytes must be passed right-justified.
      if (Size==1 || Size==2 || Size==4) {
        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

          ArgOffset += PtrByteSize;
          continue;
        }
      }

      if (GPR_idx == NumGPRs && Size < 8) {
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
                                          PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);
        ArgOffset += PtrByteSize;
        continue;
      }
      // Copy the object to parameter save area if it can not be entirely passed
      // by registers.
      // FIXME: we only need to copy the parts which need to be passed in
      // parameter save area. For the parts passed by registers, we don't need
      // to copy them to the stack although we need to allocate space for them
      // in parameter save area.
      if ((NumGPRs - GPR_idx) * PtrByteSize < Size)
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

      // When a register is available, pass a small aggregate right-justified.
      if (Size < 8 && GPR_idx != NumGPRs) {
        // The easiest way to get this right-justified in a register
        // is to copy the structure into the rightmost portion of a
        // local variable slot, then load the whole slot into the
        // register.
        // FIXME: The memcpy seems to produce pretty awful code for
        // small aggregates, particularly for packed ones.
        // FIXME: It would be preferable to use the slot in the
        // parameter save area instead of a new local variable.
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

        // Load the slot into the register.
        SDValue Load =
            DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
        MemOpChains.push_back(Load.getValue(1));
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

        // Done with this argument.
        ArgOffset += PtrByteSize;
        continue;
      }

      // For aggregates larger than PtrByteSize, copy the pieces of the
      // object that fit into registers from the parameter save area.
      for (unsigned j=0; j<Size; j+=PtrByteSize) {
        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
        if (GPR_idx != NumGPRs) {
          unsigned LoadSizeInBits = std::min(PtrByteSize, (Size - j)) * 8;
          EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), LoadSizeInBits);
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, AddArg,
                                        MachinePointerInfo(), ObjType);

          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          ArgOffset += PtrByteSize;
        } else {
          // Out of GPRs: account for the rest of the aggregate, rounded up
          // to whole doublewords, and stop.
          ArgOffset += alignTo(Size - j, PtrByteSize);
          break;
        }
      }
      continue;
    }

    switch (Arg.getSimpleValueType().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11.
        RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly.  Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != NumGPRs) {
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
      } else {
        if (IsFastCall)
          ComputePtrOff();

        assert(HasParameterArea &&
               "Parameter area must exist to pass an argument in memory.");
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, CFlags.IsTailCall, false, MemOpChains,
                         TailCallArguments, dl);
        if (IsFastCall)
          ArgOffset += PtrByteSize;
      }
      if (!IsFastCall)
        ArgOffset += PtrByteSize;
      break;
    case MVT::f32:
    case MVT::f64: {
      // These can be scalar arguments or elements of a float array type
      // passed directly.  The latter are used to implement ELFv2 homogeneous
      // float aggregates.

      // Named arguments go into FPRs first, and once they overflow, the
      // remaining arguments go into GPRs and then the parameter save area.
      // Unnamed arguments for vararg functions always go to GPRs and
      // then the parameter save area.  For now, put all arguments to vararg
      // routines always in both locations (FPR *and* GPR or stack slot).
      bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
      bool NeededLoad = false;

      // First load the argument into the next available FPR.
      if (FPR_idx != NumFPRs)
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

      // Next, load the argument into GPR or stack slot if needed.
      if (NeedGPROrStack) {
        if (GPR_idx != NumGPRs && !IsFastCall) {
          // FIXME: We may want to re-enable this for CallingConv::Fast on the
          // P8 once we support fp <-> gpr moves.

          // In the non-vararg case, this can only ever happen in the
          // presence of f32 array types, since otherwise we never run
          // out of FPRs before running out of GPRs.
          SDValue ArgVal;

          if (Arg.getValueType() != MVT::f32) {
            // Double values are always passed in a single GPR.
            ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
          } else if (!Flags.isInConsecutiveRegs()) {
            // Non-array float values are extended and passed in a GPR.
            ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
            ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
          } else if (ArgOffset % PtrByteSize != 0) {
            // If we have an array of floats, we collect every odd element
            // together with its predecessor into one GPR.
            SDValue Lo, Hi;
            Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
            Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
            if (!isLittleEndian)
              std::swap(Lo, Hi);
            ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
          } else if (Flags.isInConsecutiveRegsLast()) {
            // The final element, if even, goes into the first half of a GPR.
            ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
            ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
            if (!isLittleEndian)
              ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
                                   DAG.getConstant(32, dl, MVT::i32));
          } else {
            // Non-final even elements are skipped; they will be handled
            // together with the subsequent argument on the next go-around.
            ArgVal = SDValue();
          }

          if (ArgVal.getNode())
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
        } else {
          if (IsFastCall)
            ComputePtrOff();

          // Single-precision floating-point values are mapped to the
          // second (rightmost) word of the stack doubleword.
          if (Arg.getValueType() == MVT::f32 &&
              !isLittleEndian && !Flags.isInConsecutiveRegs()) {
            SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
          }

          assert(HasParameterArea &&
                 "Parameter area must exist to pass an argument in memory.");
          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                           true, CFlags.IsTailCall, false, MemOpChains,
                           TailCallArguments, dl);

          NeededLoad = true;
        }
      }
      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array.  Otherwise, each float takes 8 bytes.
      if (!IsFastCall || NeededLoad) {
        ArgOffset += (Arg.getValueType() == MVT::f32 &&
                      Flags.isInConsecutiveRegs()) ? 4 : 8;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = alignTo(ArgOffset, PtrByteSize);
      }
      break;
    }
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      // These can be scalar arguments or elements of a vector array type
      // passed directly.  The latter are used to implement ELFv2 homogeneous
      // vector aggregates.

      // For a varargs call, named arguments go into VRs or on the stack as
      // usual; unnamed arguments always go to the stack or the corresponding
      // GPRs when within range.  For now, we always put the value in both
      // locations (or even all three).
      if (CFlags.IsVarArg) {
        assert(HasParameterArea &&
               "Parameter area must exist if we have a varargs call.");
        // We could elide this store in the case where the object fits
        // entirely in R registers.  Maybe later.
        SDValue Store =
            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
        MemOpChains.push_back(Store);
        if (VR_idx != NumVRs) {
          SDValue Load =
              DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
        }
        ArgOffset += 16;
        // Reload the stored value one doubleword at a time into the GPRs
        // that are still free. 'Off' is the byte offset within the 16-byte
        // slot (named so that it does not shadow the argument index 'i').
        for (unsigned Off = 0; Off < 16; Off += PtrByteSize) {
          if (GPR_idx == NumGPRs)
            break;
          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                   DAG.getConstant(Off, dl, PtrVT));
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
        }
        break;
      }

      // Non-varargs Altivec params go into VRs or on the stack.
      if (VR_idx != NumVRs) {
        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
      } else {
        if (IsFastCall)
          ComputePtrOff();

        assert(HasParameterArea &&
               "Parameter area must exist to pass an argument in memory.");
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, CFlags.IsTailCall, true, MemOpChains,
                         TailCallArguments, dl);
        if (IsFastCall)
          ArgOffset += 16;
      }

      if (!IsFastCall)
        ArgOffset += 16;
      break;
    }
  }

  assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
         "mismatch in size of parameter area");
  (void)NumBytesActuallyUsed;

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Check if this is an indirect call (MTCTR/BCTRL).
  // See prepareDescriptorIndirectCall and buildCallOperands for more
  // information about calls through function pointers in the 64-bit SVR4 ABI.
  if (CFlags.IsIndirect) {
    // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
    // caller in the TOC save area.
    if (isTOCSaveRestoreRequired(Subtarget)) {
      assert(!CFlags.IsTailCall && "Indirect tails calls not supported");
      // Load r2 into a virtual register and store it to the TOC save area.
      setUsesTOCBasePtr(DAG);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
      // TOC save area offset.
      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
      SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
      Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
                           MachinePointerInfo::getStack(
                               DAG.getMachineFunction(), TOCSaveOffset));
    }
    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
    // This does not mean the MTCTR instruction must use R12; it's easier
    // to model this as an extra parameter, so do that.
    if (isELFv2ABI && !CFlags.IsPatchPoint)
      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InGlue);
    InGlue = Chain.getValue(1);
  }

  if (CFlags.IsTailCall && !IsSibCall)
    PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
                    Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
6808
6809
// Returns true when the shadow of a general purpose argument register
6810
// in the parameter save area is aligned to at least 'RequiredAlign'.
6811
static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
6812
assert(RequiredAlign.value() <= 16 &&
6813
"Required alignment greater than stack alignment.");
6814
switch (Reg) {
6815
default:
6816
report_fatal_error("called on invalid register.");
6817
case PPC::R5:
6818
case PPC::R9:
6819
case PPC::X3:
6820
case PPC::X5:
6821
case PPC::X7:
6822
case PPC::X9:
6823
// These registers are 16 byte aligned which is the most strict aligment
6824
// we can support.
6825
return true;
6826
case PPC::R3:
6827
case PPC::R7:
6828
case PPC::X4:
6829
case PPC::X6:
6830
case PPC::X8:
6831
case PPC::X10:
6832
// The shadow of these registers in the PSA is 8 byte aligned.
6833
return RequiredAlign <= 8;
6834
case PPC::R4:
6835
case PPC::R6:
6836
case PPC::R8:
6837
case PPC::R10:
6838
return RequiredAlign <= 4;
6839
}
6840
}
6841
6842
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
6843
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
6844
CCState &S) {
6845
AIXCCState &State = static_cast<AIXCCState &>(S);
6846
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
6847
State.getMachineFunction().getSubtarget());
6848
const bool IsPPC64 = Subtarget.isPPC64();
6849
const unsigned PtrSize = IsPPC64 ? 8 : 4;
6850
const Align PtrAlign(PtrSize);
6851
const Align StackAlign(16);
6852
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
6853
6854
if (ValVT == MVT::f128)
6855
report_fatal_error("f128 is unimplemented on AIX.");
6856
6857
if (ArgFlags.isNest())
6858
report_fatal_error("Nest arguments are unimplemented.");
6859
6860
static const MCPhysReg GPR_32[] = {// 32-bit registers.
6861
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
6862
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
6863
static const MCPhysReg GPR_64[] = {// 64-bit registers.
6864
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6865
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
6866
6867
static const MCPhysReg VR[] = {// Vector registers.
6868
PPC::V2, PPC::V3, PPC::V4, PPC::V5,
6869
PPC::V6, PPC::V7, PPC::V8, PPC::V9,
6870
PPC::V10, PPC::V11, PPC::V12, PPC::V13};
6871
6872
const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
6873
6874
if (ArgFlags.isByVal()) {
6875
const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
6876
if (ByValAlign > StackAlign)
6877
report_fatal_error("Pass-by-value arguments with alignment greater than "
6878
"16 are not supported.");
6879
6880
const unsigned ByValSize = ArgFlags.getByValSize();
6881
const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;
6882
6883
// An empty aggregate parameter takes up no storage and no registers,
6884
// but needs a MemLoc for a stack slot for the formal arguments side.
6885
if (ByValSize == 0) {
6886
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
6887
State.getStackSize(), RegVT, LocInfo));
6888
return false;
6889
}
6890
6891
// Shadow allocate any registers that are not properly aligned.
6892
unsigned NextReg = State.getFirstUnallocated(GPRs);
6893
while (NextReg != GPRs.size() &&
6894
!isGPRShadowAligned(GPRs[NextReg], ObjAlign)) {
6895
// Shadow allocate next registers since its aligment is not strict enough.
6896
unsigned Reg = State.AllocateReg(GPRs);
6897
// Allocate the stack space shadowed by said register.
6898
State.AllocateStack(PtrSize, PtrAlign);
6899
assert(Reg && "Alocating register unexpectedly failed.");
6900
(void)Reg;
6901
NextReg = State.getFirstUnallocated(GPRs);
6902
}
6903
6904
const unsigned StackSize = alignTo(ByValSize, ObjAlign);
6905
unsigned Offset = State.AllocateStack(StackSize, ObjAlign);
6906
for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
6907
if (unsigned Reg = State.AllocateReg(GPRs))
6908
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6909
else {
6910
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
6911
Offset, MVT::INVALID_SIMPLE_VALUE_TYPE,
6912
LocInfo));
6913
break;
6914
}
6915
}
6916
return false;
6917
}
6918
6919
// Arguments always reserve parameter save area.
6920
switch (ValVT.SimpleTy) {
6921
default:
6922
report_fatal_error("Unhandled value type for argument.");
6923
case MVT::i64:
6924
// i64 arguments should have been split to i32 for PPC32.
6925
assert(IsPPC64 && "PPC32 should have split i64 values.");
6926
[[fallthrough]];
6927
case MVT::i1:
6928
case MVT::i32: {
6929
const unsigned Offset = State.AllocateStack(PtrSize, PtrAlign);
6930
// AIX integer arguments are always passed in register width.
6931
if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
6932
LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
6933
: CCValAssign::LocInfo::ZExt;
6934
if (unsigned Reg = State.AllocateReg(GPRs))
6935
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6936
else
6937
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
6938
6939
return false;
6940
}
6941
case MVT::f32:
6942
case MVT::f64: {
6943
// Parameter save area (PSA) is reserved even if the float passes in fpr.
6944
const unsigned StoreSize = LocVT.getStoreSize();
6945
// Floats are always 4-byte aligned in the PSA on AIX.
6946
// This includes f64 in 64-bit mode for ABI compatibility.
6947
const unsigned Offset =
6948
State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4));
6949
unsigned FReg = State.AllocateReg(FPR);
6950
if (FReg)
6951
State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
6952
6953
// Reserve and initialize GPRs or initialize the PSA as required.
6954
for (unsigned I = 0; I < StoreSize; I += PtrSize) {
6955
if (unsigned Reg = State.AllocateReg(GPRs)) {
6956
assert(FReg && "An FPR should be available when a GPR is reserved.");
6957
if (State.isVarArg()) {
6958
// Successfully reserved GPRs are only initialized for vararg calls.
6959
// Custom handling is required for:
6960
// f64 in PPC32 needs to be split into 2 GPRs.
6961
// f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
6962
State.addLoc(
6963
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6964
}
6965
} else {
6966
// If there are insufficient GPRs, the PSA needs to be initialized.
6967
// Initialization occurs even if an FPR was initialized for
6968
// compatibility with the AIX XL compiler. The full memory for the
6969
// argument will be initialized even if a prior word is saved in GPR.
6970
// A custom memLoc is used when the argument also passes in FPR so
6971
// that the callee handling can skip over it easily.
6972
State.addLoc(
6973
FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
6974
LocInfo)
6975
: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6976
break;
6977
}
6978
}
6979
6980
return false;
6981
}
6982
case MVT::v4f32:
6983
case MVT::v4i32:
6984
case MVT::v8i16:
6985
case MVT::v16i8:
6986
case MVT::v2i64:
6987
case MVT::v2f64:
6988
case MVT::v1i128: {
6989
const unsigned VecSize = 16;
6990
const Align VecAlign(VecSize);
6991
6992
if (!State.isVarArg()) {
6993
// If there are vector registers remaining we don't consume any stack
6994
// space.
6995
if (unsigned VReg = State.AllocateReg(VR)) {
6996
State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
6997
return false;
6998
}
6999
// Vectors passed on the stack do not shadow GPRs or FPRs even though they
7000
// might be allocated in the portion of the PSA that is shadowed by the
7001
// GPRs.
7002
const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
7003
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
7004
return false;
7005
}
7006
7007
unsigned NextRegIndex = State.getFirstUnallocated(GPRs);
7008
// Burn any underaligned registers and their shadowed stack space until
7009
// we reach the required alignment.
7010
while (NextRegIndex != GPRs.size() &&
7011
!isGPRShadowAligned(GPRs[NextRegIndex], VecAlign)) {
7012
// Shadow allocate register and its stack shadow.
7013
unsigned Reg = State.AllocateReg(GPRs);
7014
State.AllocateStack(PtrSize, PtrAlign);
7015
assert(Reg && "Allocating register unexpectedly failed.");
7016
(void)Reg;
7017
NextRegIndex = State.getFirstUnallocated(GPRs);
7018
}
7019
7020
// Vectors that are passed as fixed arguments are handled differently.
7021
// They are passed in VRs if any are available (unlike arguments passed
7022
// through ellipses) and shadow GPRs (unlike arguments to non-vaarg
7023
// functions)
7024
if (State.isFixed(ValNo)) {
7025
if (unsigned VReg = State.AllocateReg(VR)) {
7026
State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
7027
// Shadow allocate GPRs and stack space even though we pass in a VR.
7028
for (unsigned I = 0; I != VecSize; I += PtrSize)
7029
State.AllocateReg(GPRs);
7030
State.AllocateStack(VecSize, VecAlign);
7031
return false;
7032
}
7033
// No vector registers remain so pass on the stack.
7034
const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
7035
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
7036
return false;
7037
}
7038
7039
// If all GPRS are consumed then we pass the argument fully on the stack.
7040
if (NextRegIndex == GPRs.size()) {
7041
const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
7042
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
7043
return false;
7044
}
7045
7046
// Corner case for 32-bit codegen. We have 2 registers to pass the first
7047
// half of the argument, and then need to pass the remaining half on the
7048
// stack.
7049
if (GPRs[NextRegIndex] == PPC::R9) {
7050
const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
7051
State.addLoc(
7052
CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
7053
7054
const unsigned FirstReg = State.AllocateReg(PPC::R9);
7055
const unsigned SecondReg = State.AllocateReg(PPC::R10);
7056
assert(FirstReg && SecondReg &&
7057
"Allocating R9 or R10 unexpectedly failed.");
7058
State.addLoc(
7059
CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, RegVT, LocInfo));
7060
State.addLoc(
7061
CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, RegVT, LocInfo));
7062
return false;
7063
}
7064
7065
// We have enough GPRs to fully pass the vector argument, and we have
7066
// already consumed any underaligned registers. Start with the custom
7067
// MemLoc and then the custom RegLocs.
7068
const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
7069
State.addLoc(
7070
CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
7071
for (unsigned I = 0; I != VecSize; I += PtrSize) {
7072
const unsigned Reg = State.AllocateReg(GPRs);
7073
assert(Reg && "Failed to allocated register for vararg vector argument");
7074
State.addLoc(
7075
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
7076
}
7077
return false;
7078
}
7079
}
7080
return true;
7081
}
7082
7083
// So far, this function is only used by LowerFormalArguments_AIX()
7084
static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
7085
bool IsPPC64,
7086
bool HasP8Vector,
7087
bool HasVSX) {
7088
assert((IsPPC64 || SVT != MVT::i64) &&
7089
"i64 should have been split for 32-bit codegen.");
7090
7091
switch (SVT) {
7092
default:
7093
report_fatal_error("Unexpected value type for formal argument");
7094
case MVT::i1:
7095
case MVT::i32:
7096
case MVT::i64:
7097
return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
7098
case MVT::f32:
7099
return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;
7100
case MVT::f64:
7101
return HasVSX ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass;
7102
case MVT::v4f32:
7103
case MVT::v4i32:
7104
case MVT::v8i16:
7105
case MVT::v16i8:
7106
case MVT::v2i64:
7107
case MVT::v2f64:
7108
case MVT::v1i128:
7109
return &PPC::VRRCRegClass;
7110
}
7111
}
7112
7113
// Narrows an incoming scalar integer argument from its location type 'LocVT'
// down to its value type 'ValVT'. When the argument flags say the value was
// sign- or zero-extended, a matching AssertSext/AssertZext node is attached
// first; the returned value is always an ISD::TRUNCATE to 'ValVT'.
static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
                                        SelectionDAG &DAG, SDValue ArgValue,
                                        MVT LocVT, const SDLoc &dl) {
  assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
  assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());

  const bool IsSigned = Flags.isSExt();
  if (IsSigned || Flags.isZExt()) {
    // The sign-extension flag takes precedence when both are set.
    const unsigned AssertOpc = IsSigned ? ISD::AssertSext : ISD::AssertZext;
    ArgValue = DAG.getNode(AssertOpc, dl, LocVT, ArgValue,
                           DAG.getValueType(ValVT));
  }

  return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
}
7128
7129
static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
7130
const unsigned LASize = FL->getLinkageSize();
7131
7132
if (PPC::GPRCRegClass.contains(Reg)) {
7133
assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
7134
"Reg must be a valid argument register!");
7135
return LASize + 4 * (Reg - PPC::R3);
7136
}
7137
7138
if (PPC::G8RCRegClass.contains(Reg)) {
7139
assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
7140
"Reg must be a valid argument register!");
7141
return LASize + 8 * (Reg - PPC::X3);
7142
}
7143
7144
llvm_unreachable("Only general purpose registers expected.");
7145
}
7146
7147
// AIX ABI Stack Frame Layout:
//
//      Low Memory +--------------------------------------------+
//      SP   +---> | Back chain                                 | ---+
//           |     +--------------------------------------------+    |
//           |     | Saved Condition Register                   |    |
//           |     +--------------------------------------------+    |
//           |     | Saved Linkage Register                     |    |
//           |     +--------------------------------------------+    | Linkage Area
//           |     | Reserved for compilers                     |    |
//           |     +--------------------------------------------+    |
//           |     | Reserved for binders                       |    |
//           |     +--------------------------------------------+    |
//           |     | Saved TOC pointer                          | ---+
//           |     +--------------------------------------------+
//           |     | Parameter save area                        |
//           |     +--------------------------------------------+
//           |     | Alloca space                               |
//           |     +--------------------------------------------+
//           |     | Local variable space                       |
//           |     +--------------------------------------------+
//           |     | Float/int conversion temporary             |
//           |     +--------------------------------------------+
//           |     | Save area for AltiVec registers            |
//           |     +--------------------------------------------+
//           |     | AltiVec alignment padding                  |
//           |     +--------------------------------------------+
//           |     | Save area for VRSAVE register              |
//           |     +--------------------------------------------+
//           |     | Save area for General Purpose registers    |
//           |     +--------------------------------------------+
//           |     | Save area for Floating Point registers     |
//           |     +--------------------------------------------+
//           +---- | Back chain                                 |
// High Memory     +--------------------------------------------+
//
//  Specifications:
//  AIX 7.2 Assembler Language Reference
//  Subroutine linkage convention
7186
7187
// Lowers the incoming (formal) arguments of a function for AIX.
//
// Locations are assigned with CC_AIX after reserving the linkage area. For
// each argument the corresponding value (a register copy, a stack load, or a
// frame index for by-value aggregates) is appended to 'InVals'. By-value
// pieces that arrive in GPRs are stored to their fixed stack object, and for
// variadic functions the remaining argument GPRs are stored to the stack
// starting at the VarArgsFrameIndex. The minimum caller-reserved area and the
// VarArgsFrameIndex are recorded in PPCFunctionInfo.
//
// Returns the incoming 'Chain', or a TokenFactor of the emitted stores when
// there are any. Guaranteed tail call optimization and soft float are
// reported as fatal errors.
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) &&
         "Unexpected calling convention!");

  if (getTargetMachine().Options.GuaranteedTailCallOpt)
    report_fatal_error("Tail call support is unimplemented on AIX.");

  if (useSoftFloat())
    report_fatal_error("Soft float support is unimplemented on AIX.");

  const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();

  const bool IsPPC64 = Subtarget.isPPC64();
  const unsigned PtrByteSize = IsPPC64 ? 8 : 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  const EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Reserve space for the linkage area on the stack.
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
  CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);

  // Stores emitted while lowering; merged into the chain at the end.
  SmallVector<SDValue, 8> MemOps;

  // 'I' is advanced inside the body because one argument may own several
  // consecutive ArgLocs.
  for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
    CCValAssign &VA = ArgLocs[I++];
    MVT LocVT = VA.getLocVT();
    MVT ValVT = VA.getValVT();
    ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
    // For compatibility with the AIX XL compiler, the float args in the
    // parameter save area are initialized even if the argument is available
    // in register. The caller is required to initialize both the register
    // and memory, however, the callee can choose to expect it in either.
    // The memloc is dismissed here because the argument is retrieved from
    // the register.
    // Vector types are excluded explicitly so that vectors with floating
    // point elements still reach the custom vector MemLoc handling below.
    if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint() &&
        !ValVT.isVector())
      continue;

    // Loads the argument described by the MemLoc 'VA' from a fixed stack
    // object and appends the loaded value to InVals.
    auto HandleMemLoc = [&]() {
      const unsigned LocSize = LocVT.getStoreSize();
      const unsigned ValSize = ValVT.getStoreSize();
      assert((ValSize <= LocSize) &&
             "Object size is larger than size of MemLoc");
      int CurArgOffset = VA.getLocMemOffset();
      // Objects are right-justified because AIX is big-endian.
      if (LocSize > ValSize)
        CurArgOffset += LocSize - ValSize;
      // Potential tail calls could cause overwriting of argument stack slots.
      const bool IsImmutable =
          !(getTargetMachine().Options.GuaranteedTailCallOpt &&
            (CallConv == CallingConv::Fast));
      int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      SDValue ArgValue =
          DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
      InVals.push_back(ArgValue);
    };

    // Vector arguments to VaArg functions are passed both on the stack, and
    // in any available GPRs. Load the value from the stack and add the GPRs
    // as live ins.
    if (VA.isMemLoc() && VA.needsCustom()) {
      assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
      assert(isVarArg && "Only use custom memloc for vararg.");
      // ValNo of the custom MemLoc, so we can compare it to the ValNo of the
      // matching custom RegLocs.
      const unsigned OriginalValNo = VA.getValNo();
      (void)OriginalValNo;

      // Consumes the next ArgLoc, which must be a custom RegLoc belonging to
      // the same vector argument, and marks its register as live-in.
      auto HandleCustomVecRegLoc = [&]() {
        assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
               "Missing custom RegLoc.");
        // Bind the RegLoc to its own reference. 'VA' aliases the custom
        // MemLoc entry inside ArgLocs, so assigning to it would overwrite
        // that entry.
        const CCValAssign &RegVA = ArgLocs[I++];
        assert(RegVA.getValVT().isVector() &&
               "Unexpected Val type for custom RegLoc.");
        assert(RegVA.getValNo() == OriginalValNo &&
               "ValNo mismatch between custom MemLoc and RegLoc.");
        MVT::SimpleValueType SVT = RegVA.getLocVT().SimpleTy;
        MF.addLiveIn(RegVA.getLocReg(),
                     getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
                                       Subtarget.hasVSX()));
      };

      HandleMemLoc();
      // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
      // 32-bit there will be 2 custom RegLocs if we are passing in R9 and
      // R10.
      HandleCustomVecRegLoc();
      HandleCustomVecRegLoc();

      // If we are targeting 32-bit, there might be 2 extra custom RegLocs if
      // we passed the vector in R5, R6, R7 and R8.
      if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
        assert(!IsPPC64 &&
               "Only 2 custom RegLocs expected for 64-bit codegen.");
        HandleCustomVecRegLoc();
        HandleCustomVecRegLoc();
      }

      continue;
    }

    // Record the parameter type of every register-passed argument in the
    // function info.
    if (VA.isRegLoc()) {
      if (VA.getValVT().isScalarInteger())
        FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
      else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {
        switch (VA.getValVT().SimpleTy) {
        default:
          report_fatal_error("Unhandled value type for argument.");
        case MVT::f32:
          FuncInfo->appendParameterType(PPCFunctionInfo::ShortFloatingPoint);
          break;
        case MVT::f64:
          FuncInfo->appendParameterType(PPCFunctionInfo::LongFloatingPoint);
          break;
        }
      } else if (VA.getValVT().isVector()) {
        switch (VA.getValVT().SimpleTy) {
        default:
          report_fatal_error("Unhandled value type for argument.");
        case MVT::v16i8:
          FuncInfo->appendParameterType(PPCFunctionInfo::VectorChar);
          break;
        case MVT::v8i16:
          FuncInfo->appendParameterType(PPCFunctionInfo::VectorShort);
          break;
        case MVT::v4i32:
        case MVT::v2i64:
        case MVT::v1i128:
          FuncInfo->appendParameterType(PPCFunctionInfo::VectorInt);
          break;
        case MVT::v4f32:
        case MVT::v2f64:
          FuncInfo->appendParameterType(PPCFunctionInfo::VectorFloat);
          break;
        }
      }
    }

    // By-value argument that lives entirely in memory: expose its stack slot
    // through a frame index. A zero-sized by-val still gets a pointer-sized
    // object.
    if (Flags.isByVal() && VA.isMemLoc()) {
      const unsigned Size =
          alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
                  PtrByteSize);
      const int FI = MF.getFrameInfo().CreateFixedObject(
          Size, VA.getLocMemOffset(), /* IsImmutable */ false,
          /* IsAliased */ true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);

      continue;
    }

    // By-value argument that starts in a GPR: create the fixed object at the
    // stack offset mapped from the first register, then store every register
    // piece into it.
    if (Flags.isByVal()) {
      assert(VA.isRegLoc() && "MemLocs should already be handled.");

      const MCPhysReg ArgReg = VA.getLocReg();
      const PPCFrameLowering *FL = Subtarget.getFrameLowering();

      const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
      const int FI = MF.getFrameInfo().CreateFixedObject(
          StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
          /* IsAliased */ true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);

      // Add live ins for all the RegLocs for the same ByVal.
      const TargetRegisterClass *RegClass =
          IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;

      auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
                                               unsigned Offset) {
        const Register VReg = MF.addLiveIn(PhysReg, RegClass);
        // Since the callers side has left justified the aggregate in the
        // register, we can simply store the entire register into the stack
        // slot.
        SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
        // The store to the fixedstack object is needed because accessing a
        // field of the ByVal will use a gep and load. Ideally we will optimize
        // to extracting the value from the register directly, and elide the
        // stores when the arguments address is not taken, but that will need to
        // be future work.
        SDValue Store = DAG.getStore(
            CopyFrom.getValue(1), dl, CopyFrom,
            DAG.getObjectPtrOffset(dl, FIN, TypeSize::getFixed(Offset)),
            MachinePointerInfo::getFixedStack(MF, FI, Offset));

        MemOps.push_back(Store);
      };

      unsigned Offset = 0;
      HandleRegLoc(VA.getLocReg(), Offset);
      Offset += PtrByteSize;
      for (; Offset != StackSize && ArgLocs[I].isRegLoc();
           Offset += PtrByteSize) {
        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
               "RegLocs should be for ByVal argument.");

        const CCValAssign RL = ArgLocs[I++];
        HandleRegLoc(RL.getLocReg(), Offset);
        FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
      }

      if (Offset != StackSize) {
        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
               "Expected MemLoc for remaining bytes.");
        assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
        // Consume the MemLoc. The InVal has already been emitted, so nothing
        // more needs to be done.
        ++I;
      }

      continue;
    }

    // Plain register argument: copy it out of the live-in register and
    // truncate integers that were passed wider than their value type.
    if (VA.isRegLoc() && !VA.needsCustom()) {
      MVT::SimpleValueType SVT = ValVT.SimpleTy;
      Register VReg =
          MF.addLiveIn(VA.getLocReg(),
                       getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
                                         Subtarget.hasVSX()));
      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
      if (ValVT.isScalarInteger() &&
          (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
        ArgValue =
            truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
      }
      InVals.push_back(ArgValue);
      continue;
    }
    if (VA.isMemLoc()) {
      HandleMemLoc();
      continue;
    }
  }

  // On AIX a minimum of 8 words is saved to the parameter save area.
  const unsigned MinParameterSaveArea = 8 * PtrByteSize;
  // Area that is at least reserved in the caller of this function.
  unsigned CallerReservedArea = std::max<unsigned>(
      CCInfo.getStackSize(), LinkageSize + MinParameterSaveArea);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so
  // that taking the difference between two stack areas will result in an
  // aligned stack.
  CallerReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
  FuncInfo->setMinReservedArea(CallerReservedArea);

  if (isVarArg) {
    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateFixedObject(PtrByteSize, CCInfo.getStackSize(), true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
                                       PPC::R7, PPC::R8, PPC::R9, PPC::R10};

    static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
                                       PPC::X7, PPC::X8, PPC::X9, PPC::X10};
    const unsigned NumGPArgRegs = std::size(IsPPC64 ? GPR_64 : GPR_32);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    // The first register to store is derived from how much of the parameter
    // save area the named arguments have already used.
    for (unsigned GPRIndex =
             (CCInfo.getStackSize() - LinkageSize) / PtrByteSize;
         GPRIndex < NumGPArgRegs; ++GPRIndex) {

      const Register VReg =
          IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
                  : MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address for the next argument to store.
      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}
7485
7486
SDValue PPCTargetLowering::LowerCall_AIX(
7487
SDValue Chain, SDValue Callee, CallFlags CFlags,
7488
const SmallVectorImpl<ISD::OutputArg> &Outs,
7489
const SmallVectorImpl<SDValue> &OutVals,
7490
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7491
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
7492
const CallBase *CB) const {
7493
// See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
7494
// AIX ABI stack frame layout.
7495
7496
assert((CFlags.CallConv == CallingConv::C ||
7497
CFlags.CallConv == CallingConv::Cold ||
7498
CFlags.CallConv == CallingConv::Fast) &&
7499
"Unexpected calling convention!");
7500
7501
if (CFlags.IsPatchPoint)
7502
report_fatal_error("This call type is unimplemented on AIX.");
7503
7504
const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7505
7506
MachineFunction &MF = DAG.getMachineFunction();
7507
SmallVector<CCValAssign, 16> ArgLocs;
7508
AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
7509
*DAG.getContext());
7510
7511
// Reserve space for the linkage save area (LSA) on the stack.
7512
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
7513
// [SP][CR][LR][2 x reserved][TOC].
7514
// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
7515
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7516
const bool IsPPC64 = Subtarget.isPPC64();
7517
const EVT PtrVT = getPointerTy(DAG.getDataLayout());
7518
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7519
CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
7520
CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
7521
7522
// The prolog code of the callee may store up to 8 GPR argument registers to
7523
// the stack, allowing va_start to index over them in memory if the callee
7524
// is variadic.
7525
// Because we cannot tell if this is needed on the caller side, we have to
7526
// conservatively assume that it is needed. As such, make sure we have at
7527
// least enough stack space for the caller to store the 8 GPRs.
7528
const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
7529
const unsigned NumBytes = std::max<unsigned>(
7530
LinkageSize + MinParameterSaveAreaSize, CCInfo.getStackSize());
7531
7532
// Adjust the stack pointer for the new arguments...
7533
// These operations are automatically eliminated by the prolog/epilog pass.
7534
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
7535
SDValue CallSeqStart = Chain;
7536
7537
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7538
SmallVector<SDValue, 8> MemOpChains;
7539
7540
// Set up a copy of the stack pointer for loading and storing any
7541
// arguments that may not fit in the registers available for argument
7542
// passing.
7543
const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
7544
: DAG.getRegister(PPC::R1, MVT::i32);
7545
7546
for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
7547
const unsigned ValNo = ArgLocs[I].getValNo();
7548
SDValue Arg = OutVals[ValNo];
7549
ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
7550
7551
if (Flags.isByVal()) {
7552
const unsigned ByValSize = Flags.getByValSize();
7553
7554
// Nothing to do for zero-sized ByVals on the caller side.
7555
if (!ByValSize) {
7556
++I;
7557
continue;
7558
}
7559
7560
auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
7561
return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
7562
(LoadOffset != 0)
7563
? DAG.getObjectPtrOffset(
7564
dl, Arg, TypeSize::getFixed(LoadOffset))
7565
: Arg,
7566
MachinePointerInfo(), VT);
7567
};
7568
7569
unsigned LoadOffset = 0;
7570
7571
// Initialize registers, which are fully occupied by the by-val argument.
7572
while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
7573
SDValue Load = GetLoad(PtrVT, LoadOffset);
7574
MemOpChains.push_back(Load.getValue(1));
7575
LoadOffset += PtrByteSize;
7576
const CCValAssign &ByValVA = ArgLocs[I++];
7577
assert(ByValVA.getValNo() == ValNo &&
7578
"Unexpected location for pass-by-value argument.");
7579
RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));
7580
}
7581
7582
if (LoadOffset == ByValSize)
7583
continue;
7584
7585
// There must be one more loc to handle the remainder.
7586
assert(ArgLocs[I].getValNo() == ValNo &&
7587
"Expected additional location for by-value argument.");
7588
7589
if (ArgLocs[I].isMemLoc()) {
7590
assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
7591
const CCValAssign &ByValVA = ArgLocs[I++];
7592
ISD::ArgFlagsTy MemcpyFlags = Flags;
7593
// Only memcpy the bytes that don't pass in register.
7594
MemcpyFlags.setByValSize(ByValSize - LoadOffset);
7595
Chain = CallSeqStart = createMemcpyOutsideCallSeq(
7596
(LoadOffset != 0) ? DAG.getObjectPtrOffset(
7597
dl, Arg, TypeSize::getFixed(LoadOffset))
7598
: Arg,
7599
DAG.getObjectPtrOffset(
7600
dl, StackPtr, TypeSize::getFixed(ByValVA.getLocMemOffset())),
7601
CallSeqStart, MemcpyFlags, DAG, dl);
7602
continue;
7603
}
7604
7605
// Initialize the final register residue.
7606
// Any residue that occupies the final by-val arg register must be
7607
// left-justified on AIX. Loads must be a power-of-2 size and cannot be
7608
// larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
7609
// 2 and 1 byte loads.
7610
const unsigned ResidueBytes = ByValSize % PtrByteSize;
7611
assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
7612
"Unexpected register residue for by-value argument.");
7613
SDValue ResidueVal;
7614
for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
7615
const unsigned N = llvm::bit_floor(ResidueBytes - Bytes);
7616
const MVT VT =
7617
N == 1 ? MVT::i8
7618
: ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
7619
SDValue Load = GetLoad(VT, LoadOffset);
7620
MemOpChains.push_back(Load.getValue(1));
7621
LoadOffset += N;
7622
Bytes += N;
7623
7624
// By-val arguments are passed left-justfied in register.
7625
// Every load here needs to be shifted, otherwise a full register load
7626
// should have been used.
7627
assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
7628
"Unexpected load emitted during handling of pass-by-value "
7629
"argument.");
7630
unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
7631
EVT ShiftAmountTy =
7632
getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
7633
SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
7634
SDValue ShiftedLoad =
7635
DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
7636
ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
7637
ShiftedLoad)
7638
: ShiftedLoad;
7639
}
7640
7641
const CCValAssign &ByValVA = ArgLocs[I++];
7642
RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
7643
continue;
7644
}
7645
7646
CCValAssign &VA = ArgLocs[I++];
7647
const MVT LocVT = VA.getLocVT();
7648
const MVT ValVT = VA.getValVT();
7649
7650
switch (VA.getLocInfo()) {
7651
default:
7652
report_fatal_error("Unexpected argument extension type.");
7653
case CCValAssign::Full:
7654
break;
7655
case CCValAssign::ZExt:
7656
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
7657
break;
7658
case CCValAssign::SExt:
7659
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
7660
break;
7661
}
7662
7663
if (VA.isRegLoc() && !VA.needsCustom()) {
7664
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
7665
continue;
7666
}
7667
7668
// Vector arguments passed to VarArg functions need custom handling when
7669
// they are passed (at least partially) in GPRs.
7670
if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
7671
assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
7672
// Store value to its stack slot.
7673
SDValue PtrOff =
7674
DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
7675
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
7676
SDValue Store =
7677
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
7678
MemOpChains.push_back(Store);
7679
const unsigned OriginalValNo = VA.getValNo();
7680
// Then load the GPRs from the stack
7681
unsigned LoadOffset = 0;
7682
auto HandleCustomVecRegLoc = [&]() {
7683
assert(I != E && "Unexpected end of CCvalAssigns.");
7684
assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7685
"Expected custom RegLoc.");
7686
CCValAssign RegVA = ArgLocs[I++];
7687
assert(RegVA.getValNo() == OriginalValNo &&
7688
"Custom MemLoc ValNo and custom RegLoc ValNo must match.");
7689
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
7690
DAG.getConstant(LoadOffset, dl, PtrVT));
7691
SDValue Load = DAG.getLoad(PtrVT, dl, Store, Add, MachinePointerInfo());
7692
MemOpChains.push_back(Load.getValue(1));
7693
RegsToPass.push_back(std::make_pair(RegVA.getLocReg(), Load));
7694
LoadOffset += PtrByteSize;
7695
};
7696
7697
// In 64-bit there will be exactly 2 custom RegLocs that follow, and in
7698
// 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7699
// R10.
7700
HandleCustomVecRegLoc();
7701
HandleCustomVecRegLoc();
7702
7703
if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7704
ArgLocs[I].getValNo() == OriginalValNo) {
7705
assert(!IsPPC64 &&
7706
"Only 2 custom RegLocs expected for 64-bit codegen.");
7707
HandleCustomVecRegLoc();
7708
HandleCustomVecRegLoc();
7709
}
7710
7711
continue;
7712
}
7713
7714
if (VA.isMemLoc()) {
7715
SDValue PtrOff =
7716
DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
7717
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
7718
MemOpChains.push_back(
7719
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
7720
7721
continue;
7722
}
7723
7724
if (!ValVT.isFloatingPoint())
7725
report_fatal_error(
7726
"Unexpected register handling for calling convention.");
7727
7728
// Custom handling is used for GPR initializations for vararg float
7729
// arguments.
7730
assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
7731
LocVT.isInteger() &&
7732
"Custom register handling only expected for VarArg.");
7733
7734
SDValue ArgAsInt =
7735
DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
7736
7737
if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
7738
// f32 in 32-bit GPR
7739
// f64 in 64-bit GPR
7740
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
7741
else if (Arg.getValueType().getFixedSizeInBits() <
7742
LocVT.getFixedSizeInBits())
7743
// f32 in 64-bit GPR.
7744
RegsToPass.push_back(std::make_pair(
7745
VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
7746
else {
7747
// f64 in two 32-bit GPRs
7748
// The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
7749
assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
7750
"Unexpected custom register for argument!");
7751
CCValAssign &GPR1 = VA;
7752
SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
7753
DAG.getConstant(32, dl, MVT::i8));
7754
RegsToPass.push_back(std::make_pair(
7755
GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
7756
7757
if (I != E) {
7758
// If only 1 GPR was available, there will only be one custom GPR and
7759
// the argument will also pass in memory.
7760
CCValAssign &PeekArg = ArgLocs[I];
7761
if (PeekArg.isRegLoc() && PeekArg.getValNo() == PeekArg.getValNo()) {
7762
assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
7763
CCValAssign &GPR2 = ArgLocs[I++];
7764
RegsToPass.push_back(std::make_pair(
7765
GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
7766
}
7767
}
7768
}
7769
}
7770
7771
if (!MemOpChains.empty())
7772
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
7773
7774
// For indirect calls, we need to save the TOC base to the stack for
7775
// restoration after the call.
7776
if (CFlags.IsIndirect) {
7777
assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
7778
const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
7779
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
7780
const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
7781
const unsigned TOCSaveOffset =
7782
Subtarget.getFrameLowering()->getTOCSaveOffset();
7783
7784
setUsesTOCBasePtr(DAG);
7785
SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);
7786
SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
7787
SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);
7788
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
7789
Chain = DAG.getStore(
7790
Val.getValue(1), dl, Val, AddPtr,
7791
MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
7792
}
7793
7794
// Build a sequence of copy-to-reg nodes chained together with token chain
7795
// and flag operands which copy the outgoing args into the appropriate regs.
7796
SDValue InGlue;
7797
for (auto Reg : RegsToPass) {
7798
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InGlue);
7799
InGlue = Chain.getValue(1);
7800
}
7801
7802
const int SPDiff = 0;
7803
return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
7804
Callee, SPDiff, NumBytes, Ins, InVals, CB);
7805
}
7806
7807
bool
7808
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
7809
MachineFunction &MF, bool isVarArg,
7810
const SmallVectorImpl<ISD::OutputArg> &Outs,
7811
LLVMContext &Context) const {
7812
SmallVector<CCValAssign, 16> RVLocs;
7813
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7814
return CCInfo.CheckReturn(
7815
Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
7816
? RetCC_PPC_Cold
7817
: RetCC_PPC);
7818
}
7819
7820
/// Lower a function return. Each value in \p OutVals is extended as its
/// assigned location requires and copied into the register chosen by the
/// return calling convention; the copies are glued together and fed to a
/// PPCISD::RET_GLUE node.
SDValue PPCTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
    SelectionDAG &DAG) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  if (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
    CCInfo.AnalyzeReturn(Outs, RetCC_PPC_Cold);
  else
    CCInfo.AnalyzeReturn(Outs, RetCC_PPC);

  SDValue Glue;
  // Slot 0 holds the chain; it is overwritten with the final chain below.
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // LocIdx walks the assigned locations and ValIdx the returned values. The
  // two diverge when an SPE f64 consumes two locations for a single value.
  unsigned ValIdx = 0;
  for (unsigned LocIdx = 0; LocIdx != RVLocs.size(); ++LocIdx, ++ValIdx) {
    CCValAssign Loc = RVLocs[LocIdx];
    assert(Loc.isRegLoc() && "Can only return in registers!");

    SDValue Val = OutVals[ValIdx];

    // Pick the extension (if any) needed to reach the location type.
    unsigned ExtOpc = ISD::DELETED_NODE;
    switch (Loc.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::AExt:
      ExtOpc = ISD::ANY_EXTEND;
      break;
    case CCValAssign::ZExt:
      ExtOpc = ISD::ZERO_EXTEND;
      break;
    case CCValAssign::SExt:
      ExtOpc = ISD::SIGN_EXTEND;
      break;
    }
    if (ExtOpc != ISD::DELETED_NODE)
      Val = DAG.getNode(ExtOpc, dl, Loc.getLocVT(), Val);

    if (Subtarget.hasSPE() && Loc.getLocVT() == MVT::f64) {
      // Legalize ret f64 -> ret 2 x i32.
      const bool IsLE = Subtarget.isLittleEndian();
      SDValue FirstIdx = DAG.getIntPtrConstant(IsLE ? 0 : 1, dl);
      SDValue Half =
          DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Val, FirstIdx);
      Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), Half, Glue);
      RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));

      SDValue SecondIdx = DAG.getIntPtrConstant(IsLE ? 1 : 0, dl);
      Half = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Val, SecondIdx);
      Glue = Chain.getValue(1);
      // The second half goes to the next assigned location.
      Loc = RVLocs[++LocIdx];
      Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), Half, Glue);
    } else {
      Chain = DAG.getCopyToReg(Chain, dl, Loc.getLocReg(), Val, Glue);
    }

    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
  }

  // Update chain.
  RetOps[0] = Chain;

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(PPCISD::RET_GLUE, dl, MVT::Other, RetOps);
}
7884
7885
/// Lower to a PPCISD::DYNAREAOFFSET node whose operands are the incoming
/// chain and the frame pointer save frame index, and whose result has the
/// type of \p Op.
SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  // Operands: incoming chain, then the frame pointer save slot.
  SDValue Operands[2] = {Op.getOperand(0), getFramePointerFrameIndex(DAG)};

  // The result keeps the integer type of the original node.
  SDVTList ResultTys = DAG.getVTList(Op.getValueType());
  return DAG.getNode(PPCISD::DYNAREAOFFSET, Loc, ResultTys, Operands);
}
7901
7902
/// Lower STACKRESTORE. The word currently addressed by the stack pointer (the
/// SP link) is loaded, the stack pointer is set to the saved value, and the
/// loaded word is stored back at the new stack pointer.
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // X1 is the stack pointer in 64-bit mode, R1 otherwise.
  const unsigned StackReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(StackReg, PtrVT);

  SDValue InChain = Op.getOperand(0);
  SDValue SavedSP = Op.getOperand(1);

  // Read the old link word before the stack pointer moves.
  SDValue OldLink =
      DAG.getLoad(PtrVT, dl, InChain, StackPtr, MachinePointerInfo());

  // Restore the stack pointer, ordered after the load.
  SDValue OutChain =
      DAG.getCopyToReg(OldLink.getValue(1), dl, StackReg, SavedSP);

  // Re-establish the link word at the restored stack pointer.
  return DAG.getStore(OutChain, dl, OldLink, StackPtr, MachinePointerInfo());
}
7929
7930
/// Return a frame index node for the return address save slot, creating the
/// fixed stack object and recording it in PPCFunctionInfo on first use.
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  // A zero index means the return address save slot has not been created yet.
  int SaveIdx = FuncInfo->getReturnAddrSaveIndex();
  if (SaveIdx == 0) {
    // The slot lives at the fixed offset reported by the frame lowering and
    // is one pointer wide.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    SaveIdx = MF.getFrameInfo().CreateFixedObject(Subtarget.isPPC64() ? 8 : 4,
                                                  LROffset, false);
    // Remember it so later queries reuse the same object.
    FuncInfo->setReturnAddrSaveIndex(SaveIdx);
  }

  return DAG.getFrameIndex(SaveIdx, getPointerTy(MF.getDataLayout()));
}
7951
7952
/// Return a frame index node for the frame pointer save slot, creating the
/// fixed stack object and recording it in PPCFunctionInfo on first use.
SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  // A zero index means the frame pointer save slot has not been created yet.
  int SaveIdx = FuncInfo->getFramePointerSaveIndex();
  if (SaveIdx == 0) {
    // The slot lives at the fixed offset reported by the frame lowering and
    // is one pointer wide.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    SaveIdx = MF.getFrameInfo().CreateFixedObject(Subtarget.isPPC64() ? 8 : 4,
                                                  FPOffset, true);
    // Remember it so later queries reuse the same object.
    FuncInfo->setFramePointerSaveIndex(SaveIdx);
  }

  return DAG.getFrameIndex(SaveIdx, getPointerTy(MF.getDataLayout()));
}
7974
7975
/// Lower DYNAMIC_STACKALLOC to PPCISD::PROBED_ALLOCA when inline stack
/// probing is enabled for the function, and to PPCISD::DYNALLOC otherwise.
/// The node receives the chain, the negated size and the frame pointer save
/// frame index.
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl(Op);

  SDValue InChain = Op.getOperand(0);
  SDValue AllocSize = Op.getOperand(1);

  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // The target node takes the size negated: 0 - Size.
  SDValue Zero = DAG.getConstant(0, dl, PtrVT);
  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, Zero, AllocSize);

  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  SDValue Ops[3] = {InChain, NegSize, FPSIdx};
  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);

  const unsigned Opc =
      hasInlineStackProbe(MF) ? PPCISD::PROBED_ALLOCA : PPCISD::DYNALLOC;
  return DAG.getNode(Opc, dl, VTs, Ops);
}
7996
7997
/// Lower EH_DWARF_CFA to the frame index of a newly created pointer-sized
/// fixed stack object at offset 0.
SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // One pointer wide: 8 bytes in 64-bit mode, 4 bytes otherwise.
  const int SlotSize = Subtarget.isPPC64() ? 8 : 4;
  const int CFAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, 0, false);
  return DAG.getFrameIndex(CFAIndex, PtrVT);
}
8007
8008
/// Rewrite the node as PPCISD::EH_SJLJ_SETJMP, forwarding its first two
/// operands and producing an i32 value plus a chain.
SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL, ResultTys, Op0, Op1);
}
8015
8016
/// Rewrite the node as PPCISD::EH_SJLJ_LONGJMP, forwarding its first two
/// operands and producing only a chain.
SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op0, Op1);
}
8022
8023
/// Custom-lower a load. Vector loads are handed to LowerVectorLoad; the only
/// other type expected here is i1, which is loaded as an extending i8 load
/// and then truncated.
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  const EVT ResultVT = Op.getValueType();
  if (ResultVT.isVector())
    return LowerVectorLoad(Op, DAG);

  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 loads");

  SDLoc dl(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);

  SDValue InChain = Load->getChain();
  SDValue Addr = Load->getBasePtr();
  MachineMemOperand *MemOp = Load->getMemOperand();

  // Load 8 bits, extended to the pointer-sized integer type...
  const EVT WideVT = getPointerTy(DAG.getDataLayout());
  SDValue WideLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, InChain, Addr,
                                    MVT::i8, MemOp);
  // ...then truncate to the single bit that was asked for.
  SDValue Bit = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, WideLoad);

  // Return the value together with the chain result of the new load.
  SDValue Results[] = {Bit, WideLoad.getValue(1)};
  return DAG.getMergeValues(Results, dl);
}
8047
8048
/// Custom-lower a store. Vector stores are handed to LowerVectorStore; the
/// only other type expected here is i1, which is zero-extended and written
/// with a truncating i8 store.
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  const EVT StoredVT = Op.getOperand(1).getValueType();
  if (StoredVT.isVector())
    return LowerVectorStore(Op, DAG);

  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
         "Custom lowering only for i1 stores");

  SDLoc dl(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);

  SDValue InChain = Store->getChain();
  SDValue Addr = Store->getBasePtr();
  MachineMemOperand *MemOp = Store->getMemOperand();

  // Zero-extend the bit to the pointer-sized integer type...
  const EVT WideVT = getPointerTy(DAG.getDataLayout());
  SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Store->getValue());

  // ...and write back only the low 8 bits.
  return DAG.getTruncStore(InChain, dl, Wide, Addr, MVT::i8, MemOp);
}
8069
8070
// FIXME: Remove this once the ANDI glue bug is fixed:
8071
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
8072
assert(Op.getValueType() == MVT::i1 &&
8073
"Custom lowering only for i1 results");
8074
8075
SDLoc DL(Op);
8076
return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));
8077
}
8078
8079
/// Lower a vector truncate whose result fits in a vector register to a single
/// vector shuffle. Returns an empty SDValue when the types are not handled.
SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
                                               SelectionDAG &DAG) const {
  // Vector truncates are legalized down to the point where the source fits in
  // a vector register (so the result is smaller than one). Legalization then
  // tries to custom lower the sub-legal result and ends up here, where the
  // truncate can be expressed as a single target operation.
  //
  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
  //
  // For big-endian ordering it is implemented as (x denotes undefined):
  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
  //
  // The same operation in little-endian ordering is:
  //   <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
  //   <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>

  const EVT TrgVT = Op.getValueType();
  assert(TrgVT.isVector() && "Vector type expected.");
  const unsigned TrgNumElts = TrgVT.getVectorNumElements();
  const EVT EltVT = TrgVT.getVectorElementType();

  // Only custom-marked results of at most 128 bits with power-of-two element
  // count and element width are handled.
  if (!isOperationCustom(Op.getOpcode(), TrgVT))
    return SDValue();
  if (TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) ||
      !llvm::has_single_bit<uint32_t>(EltVT.getSizeInBits()))
    return SDValue();

  SDValue Src = Op.getOperand(0);
  const EVT SrcVT = Src.getValueType();
  const unsigned SrcSize = SrcVT.getSizeInBits();

  // The source may span at most two vector registers and must have the same
  // power-of-two shape properties.
  if (SrcSize > 256 || !isPowerOf2_32(SrcVT.getVectorNumElements()) ||
      !llvm::has_single_bit<uint32_t>(
          SrcVT.getVectorElementType().getSizeInBits()))
    return SDValue();
  // A 256-bit source is split in half below, which needs two or more elements.
  if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)
    return SDValue();

  // The shuffle is performed on a full 128-bit vector of result elements.
  const unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  const EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);

  SDLoc DL(Op);
  SDValue Lo, Hi;
  if (SrcSize == 256) {
    // Split the source into its two halves.
    const EVT IdxVT = getVectorIdxTy(DAG.getDataLayout());
    const EVT HalfVT =
        Src.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
    const unsigned HalfNumElts = HalfVT.getVectorNumElements();
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Src,
                     DAG.getConstant(0, DL, IdxVT));
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Src,
                     DAG.getConstant(HalfNumElts, DL, IdxVT));
  } else {
    // A narrower source is widened; the second shuffle input is undefined.
    if (SrcSize == 128)
      Lo = Src;
    else
      Lo = widenVec(DAG, Src, DL);
    Hi = DAG.getUNDEF(WideVT);
  }

  // First list the elements we want to keep: the piece of each source element
  // that holds its low-order bits, which depends on endianness.
  const unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
  const bool IsLE = Subtarget.isLittleEndian();
  SmallVector<int, 16> Mask;
  for (unsigned Elt = 0; Elt < TrgNumElts; ++Elt) {
    const unsigned Idx = IsLE ? Elt * SizeMult : (Elt + 1) * SizeMult - 1;
    Mask.push_back(Idx);
  }

  // Fill the remaining lanes from the second input; their contents are not
  // part of the truncated result.
  while (Mask.size() < WideNumElts)
    Mask.push_back(WideNumElts + 1);

  Lo = DAG.getNode(ISD::BITCAST, DL, WideVT, Lo);
  Hi = DAG.getNode(ISD::BITCAST, DL, WideVT, Hi);
  return DAG.getVectorShuffle(WideVT, DL, Lo, Hi, Mask);
}
8159
8160
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
8161
/// possible.
8162
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
8163
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
8164
EVT ResVT = Op.getValueType();
8165
EVT CmpVT = Op.getOperand(0).getValueType();
8166
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
8167
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
8168
SDLoc dl(Op);
8169
8170
// Without power9-vector, we don't have native instruction for f128 comparison.
8171
// Following transformation to libcall is needed for setcc:
8172
// select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc cc, x, y), 0, tv, fv, NE
8173
if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {
8174
SDValue Z = DAG.getSetCC(
8175
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT),
8176
LHS, RHS, CC);
8177
SDValue Zero = DAG.getConstant(0, dl, Z.getValueType());
8178
return DAG.getSelectCC(dl, Z, Zero, TV, FV, ISD::SETNE);
8179
}
8180
8181
// Not FP, or using SPE? Not a fsel.
8182
if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
8183
Subtarget.hasSPE())
8184
return Op;
8185
8186
SDNodeFlags Flags = Op.getNode()->getFlags();
8187
8188
// We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the
8189
// presence of infinities.
8190
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
8191
switch (CC) {
8192
default:
8193
break;
8194
case ISD::SETOGT:
8195
case ISD::SETGT:
8196
return DAG.getNode(PPCISD::XSMAXC, dl, Op.getValueType(), LHS, RHS);
8197
case ISD::SETOLT:
8198
case ISD::SETLT:
8199
return DAG.getNode(PPCISD::XSMINC, dl, Op.getValueType(), LHS, RHS);
8200
}
8201
}
8202
8203
// We might be able to do better than this under some circumstances, but in
8204
// general, fsel-based lowering of select is a finite-math-only optimization.
8205
// For more information, see section F.3 of the 2.06 ISA specification.
8206
// With ISA 3.0
8207
if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
8208
(!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()) ||
8209
ResVT == MVT::f128)
8210
return Op;
8211
8212
// If the RHS of the comparison is a 0.0, we don't need to do the
8213
// subtraction at all.
8214
SDValue Sel1;
8215
if (isFloatingPointZero(RHS))
8216
switch (CC) {
8217
default: break; // SETUO etc aren't handled by fsel.
8218
case ISD::SETNE:
8219
std::swap(TV, FV);
8220
[[fallthrough]];
8221
case ISD::SETEQ:
8222
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8223
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
8224
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
8225
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
8226
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
8227
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
8228
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
8229
case ISD::SETULT:
8230
case ISD::SETLT:
8231
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
8232
[[fallthrough]];
8233
case ISD::SETOGE:
8234
case ISD::SETGE:
8235
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8236
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
8237
return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
8238
case ISD::SETUGT:
8239
case ISD::SETGT:
8240
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
8241
[[fallthrough]];
8242
case ISD::SETOLE:
8243
case ISD::SETLE:
8244
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
8245
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
8246
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
8247
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
8248
}
8249
8250
SDValue Cmp;
8251
switch (CC) {
8252
default: break; // SETUO etc aren't handled by fsel.
8253
case ISD::SETNE:
8254
std::swap(TV, FV);
8255
[[fallthrough]];
8256
case ISD::SETEQ:
8257
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
8258
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8259
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
8260
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
8261
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
8262
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
8263
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
8264
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
8265
case ISD::SETULT:
8266
case ISD::SETLT:
8267
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
8268
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8269
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
8270
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
8271
case ISD::SETOGE:
8272
case ISD::SETGE:
8273
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
8274
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8275
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
8276
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
8277
case ISD::SETUGT:
8278
case ISD::SETGT:
8279
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
8280
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8281
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
8282
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
8283
case ISD::SETOLE:
8284
case ISD::SETLE:
8285
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
8286
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
8287
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
8288
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
8289
}
8290
return Op;
8291
}
8292
8293
static unsigned getPPCStrictOpcode(unsigned Opc) {
8294
switch (Opc) {
8295
default:
8296
llvm_unreachable("No strict version of this opcode!");
8297
case PPCISD::FCTIDZ:
8298
return PPCISD::STRICT_FCTIDZ;
8299
case PPCISD::FCTIWZ:
8300
return PPCISD::STRICT_FCTIWZ;
8301
case PPCISD::FCTIDUZ:
8302
return PPCISD::STRICT_FCTIDUZ;
8303
case PPCISD::FCTIWUZ:
8304
return PPCISD::STRICT_FCTIWUZ;
8305
case PPCISD::FCFID:
8306
return PPCISD::STRICT_FCFID;
8307
case PPCISD::FCFIDU:
8308
return PPCISD::STRICT_FCFIDU;
8309
case PPCISD::FCFIDS:
8310
return PPCISD::STRICT_FCFIDS;
8311
case PPCISD::FCFIDUS:
8312
return PPCISD::STRICT_FCFIDUS;
8313
}
8314
}
8315
8316
/// Build the PPC conversion node (FCTI*Z or its strict form) for an
/// FP_TO_SINT/FP_TO_UINT \p Op. The returned node still has a floating-point
/// value type (f64, or f128 for an f128 source); for strict nodes it also
/// carries an output chain as result 1.
static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,
                              const PPCSubtarget &Subtarget) {
  SDLoc dl(Op);
  const bool IsStrict = Op->isStrictFPOpcode();
  const unsigned OrigOpc = Op.getOpcode();
  const bool IsSigned =
      OrigOpc == ISD::FP_TO_SINT || OrigOpc == ISD::STRICT_FP_TO_SINT;

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  // For strict nodes, operand 0 is the chain and the source is operand 1.
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  MVT DestTy = Op.getSimpleValueType();
  assert(Src.getValueType().isFloatingPoint() &&
         (DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 ||
          DestTy == MVT::i64) &&
         "Invalid FP_TO_INT types");

  // f32 sources are first extended to f64.
  if (Src.getValueType() == MVT::f32) {
    if (!IsStrict) {
      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
    } else {
      Src =
          DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
                      DAG.getVTList(MVT::f64, MVT::Other), {Chain, Src}, Flags);
      Chain = Src.getValue(1);
    }
  }

  // With P9 vector support, i8/i16 results use the native integer width.
  const bool IsSmallInt = DestTy == MVT::i8 || DestTy == MVT::i16;
  if (IsSmallInt && Subtarget.hasP9Vector())
    DestTy = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;

  unsigned Opc = ISD::DELETED_NODE;
  if (DestTy == MVT::i32) {
    // Without FPCVT, an unsigned i32 result uses the signed 64-bit conversion.
    if (IsSigned)
      Opc = PPCISD::FCTIWZ;
    else
      Opc = Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ;
  } else if (DestTy == MVT::i64) {
    assert((IsSigned || Subtarget.hasFPCVT()) &&
           "i64 FP_TO_UINT is supported only with FPCVT");
    Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
  } else {
    llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  }

  const EVT ConvTy = Src.getValueType() == MVT::f128 ? MVT::f128 : MVT::f64;
  if (!IsStrict)
    return DAG.getNode(Opc, dl, ConvTy, Src);

  Opc = getPPCStrictOpcode(Opc);
  return DAG.getNode(Opc, dl, DAG.getVTList(ConvTy, MVT::Other), {Chain, Src},
                     Flags);
}
8369
8370
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
8371
SelectionDAG &DAG,
8372
const SDLoc &dl) const {
8373
SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
8374
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
8375
Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
8376
bool IsStrict = Op->isStrictFPOpcode();
8377
8378
// Convert the FP value to an int value through memory.
8379
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
8380
(IsSigned || Subtarget.hasFPCVT());
8381
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
8382
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
8383
MachinePointerInfo MPI =
8384
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
8385
8386
// Emit a store to the stack slot.
8387
SDValue Chain = IsStrict ? Tmp.getValue(1) : DAG.getEntryNode();
8388
Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
8389
if (i32Stack) {
8390
MachineFunction &MF = DAG.getMachineFunction();
8391
Alignment = Align(4);
8392
MachineMemOperand *MMO =
8393
MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
8394
SDValue Ops[] = { Chain, Tmp, FIPtr };
8395
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
8396
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
8397
} else
8398
Chain = DAG.getStore(Chain, dl, Tmp, FIPtr, MPI, Alignment);
8399
8400
// Result is a load from the stack slot. If loading 4 bytes, make sure to
8401
// add in a bias on big endian.
8402
if (Op.getValueType() == MVT::i32 && !i32Stack) {
8403
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
8404
DAG.getConstant(4, dl, FIPtr.getValueType()));
8405
MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
8406
}
8407
8408
RLI.Chain = Chain;
8409
RLI.Ptr = FIPtr;
8410
RLI.MPI = MPI;
8411
RLI.Alignment = Alignment;
8412
}
8413
8414
/// Custom lowers floating point to integer conversions to use
8415
/// the direct move instructions available in ISA 2.07 to avoid the
8416
/// need for load/store combinations.
8417
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
8418
SelectionDAG &DAG,
8419
const SDLoc &dl) const {
8420
SDValue Conv = convertFPToInt(Op, DAG, Subtarget);
8421
SDValue Mov = DAG.getNode(PPCISD::MFVSR, dl, Op.getValueType(), Conv);
8422
if (Op->isStrictFPOpcode())
8423
return DAG.getMergeValues({Mov, Conv.getValue(1)}, dl);
8424
else
8425
return Mov;
8426
}
8427
8428
/// Lower a scalar FP_TO_SINT / FP_TO_UINT node or its STRICT_ counterpart.
///
/// - f128 sources: the node is returned unchanged when the subtarget has P9
///   vector support, otherwise an empty SDValue is returned.
/// - ppcf128 sources: expanded by hand when the result is i32; any other
///   result type yields an empty SDValue.
/// - everything else: a direct move is used when available on PPC64,
///   otherwise the value is reloaded from the location prepared by
///   LowerFP_TO_INTForReuse.
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                                          const SDLoc &dl) const {
  const unsigned Opc = Op.getOpcode();
  const bool IsStrict = Op->isStrictFPOpcode();
  const bool IsSigned =
      Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
  // Strict nodes carry the chain as operand 0, so the value is operand 1.
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // FP to INT conversions are legal for f128.
  if (SrcVT == MVT::f128) {
    if (Subtarget.hasP9Vector())
      return Op;
    return SDValue();
  }

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  if (SrcVT == MVT::ppcf128) {
    if (DstVT != MVT::i32)
      return SDValue();

    // TODO: Conservatively pass only nofpexcept flag here. Need to check and
    // set other fast-math flags to FP operations in both strict and
    // non-strict cases. (FP_TO_SINT, FSUB)
    SDNodeFlags Flags;
    Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

    if (IsSigned) {
      auto [Lo, Hi] = DAG.SplitScalar(Src, dl, MVT::f64, MVT::f64);

      // Add the two halves of the long double in round-to-zero mode, and use
      // a smaller FP_TO_SINT.
      if (!IsStrict) {
        SDValue Sum = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
        return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Sum);
      }

      SDValue Sum = DAG.getNode(PPCISD::STRICT_FADDRTZ, dl,
                                DAG.getVTList(MVT::f64, MVT::Other),
                                {Op.getOperand(0), Lo, Hi}, Flags);
      return DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,
                         DAG.getVTList(MVT::i32, MVT::Other),
                         {Sum.getValue(1), Sum}, Flags);
    }

    // Unsigned conversion: build 2^31 as a ppcf128 constant plus the matching
    // integer sign-bit mask.
    const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
    APFloat TwoE31FP(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
    SDValue Cst = DAG.getConstantFP(TwoE31FP, dl, SrcVT);
    SDValue SignMask = DAG.getConstant(0x80000000, dl, DstVT);

    if (!IsStrict) {
      // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
      // FIXME: generated code sucks.
      SDValue Shifted = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Src, Cst);
      SDValue ShiftedInt = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Shifted);
      SDValue Large = DAG.getNode(ISD::ADD, dl, MVT::i32, ShiftedInt, SignMask);
      SDValue Small = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
      return DAG.getSelectCC(dl, Src, Cst, Large, Small, ISD::SETGE);
    }

    // Sel = Src < 0x80000000
    // FltOfs = select Sel, 0.0, 0x80000000
    // IntOfs = select Sel, 0, 0x80000000
    // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
    SDValue Chain = Op.getOperand(0);
    EVT SetCCVT =
        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
    EVT DstSetCCVT =
        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT);
    SDValue Sel =
        DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT, Chain, true);
    Chain = Sel.getValue(1);

    SDValue FltOfs = DAG.getSelect(dl, SrcVT, Sel,
                                   DAG.getConstantFP(0.0, dl, SrcVT), Cst);
    Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT);

    SDValue Diff =
        DAG.getNode(ISD::STRICT_FSUB, dl, DAG.getVTList(SrcVT, MVT::Other),
                    {Chain, Src, FltOfs}, Flags);
    Chain = Diff.getValue(1);
    SDValue SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,
                               DAG.getVTList(DstVT, MVT::Other),
                               {Chain, Diff}, Flags);
    Chain = SInt.getValue(1);
    SDValue IntOfs = DAG.getSelect(dl, DstVT, Sel,
                                   DAG.getConstant(0, dl, DstVT), SignMask);
    SDValue Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs);
    return DAG.getMergeValues({Result, Chain}, dl);
  }

  if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
    return LowerFP_TO_INTDirectMove(Op, DAG, dl);

  // Fall back to reading the converted value from the location described by
  // the reuse info.
  ReuseLoadInfo RLI;
  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);

  return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
                     RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
}
8527
8528
// We're trying to insert a regular store, S, and then a load, L. If the
8529
// incoming value, O, is a load, we might just be able to have our load use the
8530
// address used by O. However, we don't know if anything else will store to
8531
// that address before we can load from it. To prevent this situation, we need
8532
// to insert our load, L, into the chain as a peer of O. To do this, we give L
8533
// the same chain operand as O, we create a token factor from the chain results
8534
// of O and L, and we replace all uses of O's chain result with that token
8535
// factor (see spliceIntoChain below for this last part).
8536
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
8537
ReuseLoadInfo &RLI,
8538
SelectionDAG &DAG,
8539
ISD::LoadExtType ET) const {
8540
// Conservatively skip reusing for constrained FP nodes.
8541
if (Op->isStrictFPOpcode())
8542
return false;
8543
8544
SDLoc dl(Op);
8545
bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
8546
(Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
8547
if (ET == ISD::NON_EXTLOAD &&
8548
(ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
8549
isOperationLegalOrCustom(Op.getOpcode(),
8550
Op.getOperand(0).getValueType())) {
8551
8552
LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
8553
return true;
8554
}
8555
8556
LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
8557
if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
8558
LD->isNonTemporal())
8559
return false;
8560
if (LD->getMemoryVT() != MemVT)
8561
return false;
8562
8563
// If the result of the load is an illegal type, then we can't build a
8564
// valid chain for reuse since the legalised loads and token factor node that
8565
// ties the legalised loads together uses a different output chain then the
8566
// illegal load.
8567
if (!isTypeLegal(LD->getValueType(0)))
8568
return false;
8569
8570
RLI.Ptr = LD->getBasePtr();
8571
if (LD->isIndexed() && !LD->getOffset().isUndef()) {
8572
assert(LD->getAddressingMode() == ISD::PRE_INC &&
8573
"Non-pre-inc AM on PPC?");
8574
RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
8575
LD->getOffset());
8576
}
8577
8578
RLI.Chain = LD->getChain();
8579
RLI.MPI = LD->getPointerInfo();
8580
RLI.IsDereferenceable = LD->isDereferenceable();
8581
RLI.IsInvariant = LD->isInvariant();
8582
RLI.Alignment = LD->getAlign();
8583
RLI.AAInfo = LD->getAAInfo();
8584
RLI.Ranges = LD->getRanges();
8585
8586
RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
8587
return true;
8588
}
8589
8590
// Given the head of the old chain, ResChain, insert a token factor containing
8591
// it and NewResChain, and make users of ResChain now be users of that token
8592
// factor.
8593
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
8594
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
8595
SDValue NewResChain,
8596
SelectionDAG &DAG) const {
8597
if (!ResChain)
8598
return;
8599
8600
SDLoc dl(NewResChain);
8601
8602
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
8603
NewResChain, DAG.getUNDEF(MVT::Other));
8604
assert(TF.getNode() != NewResChain.getNode() &&
8605
"A new TF really is required here");
8606
8607
DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
8608
DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
8609
}
8610
8611
/// Analyze profitability of direct move
8612
/// prefer float load to int load plus direct move
8613
/// when there is no integer use of int load
8614
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
8615
SDNode *Origin = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0).getNode();
8616
if (Origin->getOpcode() != ISD::LOAD)
8617
return true;
8618
8619
// If there is no LXSIBZX/LXSIHZX, like Power8,
8620
// prefer direct move if the memory size is 1 or 2 bytes.
8621
MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
8622
if (!Subtarget.hasP9Vector() &&
8623
(!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))
8624
return true;
8625
8626
for (SDNode::use_iterator UI = Origin->use_begin(),
8627
UE = Origin->use_end();
8628
UI != UE; ++UI) {
8629
8630
// Only look at the users of the loaded value.
8631
if (UI.getUse().get().getResNo() != 0)
8632
continue;
8633
8634
if (UI->getOpcode() != ISD::SINT_TO_FP &&
8635
UI->getOpcode() != ISD::UINT_TO_FP &&
8636
UI->getOpcode() != ISD::STRICT_SINT_TO_FP &&
8637
UI->getOpcode() != ISD::STRICT_UINT_TO_FP)
8638
return true;
8639
}
8640
8641
return false;
8642
}
8643
8644
/// Emit the PPC conversion node (FCFID family) that turns \p Src into the
/// floating-point result of the int-to-fp node \p Op. For strict opcodes the
/// strict variant of the conversion is emitted on \p Chain, or on operand 0
/// of \p Op when no chain is supplied.
static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,
                              const PPCSubtarget &Subtarget,
                              SDValue Chain = SDValue()) {
  const bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
                        Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
  SDLoc dl(Op);

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  const bool IsSingle =
      Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();
  unsigned ConvOpc;
  if (IsSingle)
    ConvOpc = IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS;
  else
    ConvOpc = IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU;
  EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;

  if (!Op->isStrictFPOpcode())
    return DAG.getNode(ConvOpc, dl, ConvTy, Src);

  if (!Chain)
    Chain = Op.getOperand(0);
  return DAG.getNode(getPPCStrictOpcode(ConvOpc), dl,
                     DAG.getVTList(ConvTy, MVT::Other), {Chain, Src}, Flags);
}
8669
8670
/// Custom lowers integer to floating point conversions to use
8671
/// the direct move instructions available in ISA 2.07 to avoid the
8672
/// need for load/store combinations.
8673
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
8674
SelectionDAG &DAG,
8675
const SDLoc &dl) const {
8676
assert((Op.getValueType() == MVT::f32 ||
8677
Op.getValueType() == MVT::f64) &&
8678
"Invalid floating point type as target of conversion");
8679
assert(Subtarget.hasFPCVT() &&
8680
"Int to FP conversions with direct moves require FPCVT");
8681
SDValue Src = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
8682
bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
8683
bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||
8684
Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
8685
unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;
8686
SDValue Mov = DAG.getNode(MovOpc, dl, MVT::f64, Src);
8687
return convertIntToFP(Op, Mov, DAG, Subtarget);
8688
}
8689
8690
/// Widen \p Vec (which must be narrower than 128 bits) to a 128-bit vector of
/// the same element type by concatenating it with undef vectors. The original
/// vector occupies the first concatenation slot.
static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
  EVT NarrowVT = Vec.getValueType();
  assert(NarrowVT.isVector() && "Expected a vector type.");
  assert(NarrowVT.getSizeInBits() < 128 && "Vector is already full width.");

  EVT EltVT = NarrowVT.getVectorElementType();
  const unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);

  // Start with every slot undef, then drop the input into slot 0.
  const unsigned NumParts = WideNumElts / NarrowVT.getVectorNumElements();
  SmallVector<SDValue, 16> Parts(NumParts, DAG.getUNDEF(NarrowVT));
  Parts[0] = Vec;

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Parts);
}
8709
8710
/// Lower a vector [STRICT_]SINT_TO_FP / [STRICT_]UINT_TO_FP producing v2f64
/// or v4f32 from a narrow integer source vector. The source is widened to 128
/// bits, its elements are shuffled into the positions they occupy in the
/// intermediate v2i64/v4i32 vector, the intermediate is sign- or
/// zero-extended, and the original conversion opcode is re-emitted on it.
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  const bool IsStrict = Op->isStrictFPOpcode();
  const unsigned Opc = Op.getOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||
          Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&
         "Unexpected conversion type");
  assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
         "Supports conversions to v2f64/v4f32 only.");

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  const bool SignedConv =
      Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
  const bool FourEltRes = Op.getValueType() == MVT::v4f32;

  SDValue Wide = widenVec(DAG, Src, dl);
  EVT WideVT = Wide.getValueType();
  const unsigned WideNumElts = WideVT.getVectorNumElements();
  MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;

  // By default every lane selects from the second shuffle operand.
  SmallVector<int, 16> ShuffV(WideNumElts);
  for (unsigned Lane = 0; Lane < WideNumElts; ++Lane)
    ShuffV[Lane] = Lane + WideNumElts;

  // Place each kept source element at the low-order narrow lane of its wide
  // element: the first lane of each group on little endian, the last lane on
  // big endian.
  const int SaveElts = FourEltRes ? 4 : 2;
  const int Stride = WideNumElts / SaveElts;
  const bool IsLE = Subtarget.isLittleEndian();
  for (int Elt = 0; Elt < SaveElts; ++Elt) {
    const int Lane = IsLE ? Elt * Stride : (Elt + 1) * Stride - 1;
    ShuffV[Lane] = Elt;
  }

  // Signed conversions sign-extend afterwards, so the filler lanes may be
  // undef; unsigned conversions need zero filler lanes.
  SDValue ShuffleSrc2 =
      SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
  SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);

  SDValue Extend;
  if (!SignedConv) {
    Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);
  } else {
    Arrange = DAG.getBitcast(IntermediateVT, Arrange);
    EVT ExtVT = Src.getValueType();
    if (Subtarget.hasP9Altivec())
      ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
                               IntermediateVT.getVectorNumElements());

    Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
                         DAG.getValueType(ExtVT));
  }

  if (!IsStrict)
    return DAG.getNode(Opc, dl, Op.getValueType(), Extend);

  return DAG.getNode(Opc, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
                     {Op.getOperand(0), Extend}, Flags);
}
8769
8770
/// Custom lowering for [STRICT_]SINT_TO_FP and [STRICT_]UINT_TO_FP.
///
/// Vector results with a custom-marked operation are forwarded to
/// LowerINT_TO_FPVector. f128 results are returned unchanged when the
/// subtarget has P9 vector support. Results other than f32/f64 yield an empty
/// SDValue. i1 sources become a select between 1.0 and 0.0. Remaining scalar
/// conversions use a direct move when profitable, and otherwise get the
/// integer bits into a floating-point register through memory (reusing an
/// existing load address when possible) before converting them.
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const unsigned Opc = Op.getOpcode();
  const bool IsSigned =
      Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
  const bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  EVT InVT = Src.getValueType();
  EVT OutVT = Op.getValueType();
  if (OutVT.isVector() && OutVT.isFloatingPoint() &&
      isOperationCustom(Opc, InVT))
    return LowerINT_TO_FPVector(Op, DAG, dl);

  // Conversions to f128 are legal.
  if (OutVT == MVT::f128)
    return Subtarget.hasP9Vector() ? Op : SDValue();

  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
  if (OutVT != MVT::f32 && OutVT != MVT::f64)
    return SDValue();

  if (InVT == MVT::i1) {
    SDValue Sel = DAG.getNode(ISD::SELECT, dl, OutVT, Src,
                              DAG.getConstantFP(1.0, dl, OutVT),
                              DAG.getConstantFP(0.0, dl, OutVT));
    if (!IsStrict)
      return Sel;
    return DAG.getMergeValues({Sel, Chain}, dl);
  }

  // If we have direct moves, we can do all the conversion, skip the store/load
  // however, without FPCVT we can't do most conversions.
  if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
      Subtarget.isPPC64() && Subtarget.hasFPCVT())
    return LowerINT_TO_FPDirectMove(Op, DAG, dl);

  assert((IsSigned || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Store the 32-bit value Word into a fresh 4-byte stack slot (advancing
  // Chain to the store) and describe that slot in Info.
  auto SpillWordToStack = [&](SDValue Word, ReuseLoadInfo &Info) {
    int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

    SDValue Store =
        DAG.getStore(Chain, dl, Word, FIdx,
                     MachinePointerInfo::getFixedStack(MF, FrameIdx));
    Chain = Store;

    assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
           "Expected an i32 store");

    Info.Ptr = FIdx;
    Info.Chain = Chain;
    Info.MPI = MachinePointerInfo::getFixedStack(MF, FrameIdx);
    Info.Alignment = Align(4);
  };

  // Emit an LFIWAX/LFIWZX-style 4-byte load into an f64 from the location
  // described by Info.
  auto EmitWordLoad = [&](unsigned LoadOpc,
                          const ReuseLoadInfo &Info) -> SDValue {
    MachineMemOperand *MMO =
        MF.getMachineMemOperand(Info.MPI, MachineMemOperand::MOLoad, 4,
                                Info.Alignment, Info.AAInfo, Info.Ranges);
    SDValue Ops[] = {Info.Chain, Info.Ptr};
    return DAG.getMemIntrinsicNode(LoadOpc, dl,
                                   DAG.getVTList(MVT::f64, MVT::Other), Ops,
                                   MVT::i32, MMO);
  };

  // Convert the integer bits held in FPBits and, when the result is f32 but
  // FPCVT is unavailable, round the f64 conversion result down to f32.
  auto ConvertAndRound = [&](SDValue FPBits) -> SDValue {
    SDValue FP = convertIntToFP(Op, FPBits, DAG, Subtarget, Chain);
    if (IsStrict)
      Chain = FP.getValue(1);

    if (OutVT != MVT::f32 || Subtarget.hasFPCVT())
      return FP;

    if (IsStrict)
      return DAG.getNode(ISD::STRICT_FP_ROUND, dl,
                         DAG.getVTList(MVT::f32, MVT::Other),
                         {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);
    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
  };

  if (InVT == MVT::i64) {
    SDValue SINT = Src;
    // When converting to single-precision, we actually need to convert
    // to double-precision first and then round to single-precision.
    // To avoid double-rounding effects during that operation, we have
    // to prepare the input operand.  Bits that might be truncated when
    // converting to double-precision are replaced by a bit that won't
    // be lost at this stage, but is below the single-precision rounding
    // position.
    //
    // However, if -enable-unsafe-fp-math is in effect, accept double
    // rounding to avoid the extra overhead.
    if (OutVT == MVT::f32 && !Subtarget.hasFPCVT() &&
        !DAG.getTarget().Options.UnsafeFPMath) {

      // Twiddle input to make sure the low 11 bits are zero.  (If this
      // is the case, we are guaranteed the value will fit into the 53 bit
      // mantissa of an IEEE double-precision value without rounding.)
      // If any of those low 11 bits were not zero originally, make sure
      // bit 12 (value 2048) is set instead, so that the final rounding
      // to single-precision gets the correct result.
      SDValue LowBits = DAG.getNode(ISD::AND, dl, MVT::i64, SINT,
                                    DAG.getConstant(2047, dl, MVT::i64));
      SDValue Carry = DAG.getNode(ISD::ADD, dl, MVT::i64, LowBits,
                                  DAG.getConstant(2047, dl, MVT::i64));
      SDValue Sticky = DAG.getNode(ISD::OR, dl, MVT::i64, Carry, SINT);
      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, Sticky,
                                  DAG.getConstant(-2048, dl, MVT::i64));

      // However, we cannot use that value unconditionally: if the magnitude
      // of the input value is small, the bit-twiddling we did above might
      // end up visibly changing the output.  Fortunately, in that case, we
      // don't need to twiddle bits since the original input will convert
      // exactly to double-precision floating-point already.  Therefore,
      // construct a conditional to use the original value if the top 11
      // bits are all sign-bit copies, and use the rounded value computed
      // above otherwise.
      SDValue TopBits = DAG.getNode(ISD::SRA, dl, MVT::i64, SINT,
                                    DAG.getConstant(53, dl, MVT::i32));
      SDValue TopPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i64, TopBits,
                                       DAG.getConstant(1, dl, MVT::i64));
      SDValue NeedsRound = DAG.getSetCC(
          dl,
          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
          TopPlusOne, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);

      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, NeedsRound, Round, SINT);
    }

    ReuseLoadInfo RLI;
    SDValue Bits;

    if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
      // The source is a 64-bit load: reload the same bytes as an f64.
      Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
                         RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (Subtarget.hasLFIWAX() &&
               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
      // The source is a sign-extending 32-bit load.
      Bits = EmitWordLoad(PPCISD::LFIWAX, RLI);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (Subtarget.hasFPCVT() &&
               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
      // The source is a zero-extending 32-bit load.
      Bits = EmitWordLoad(PPCISD::LFIWZX, RLI);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (((Subtarget.hasLFIWAX() &&
                 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
                (Subtarget.hasFPCVT() &&
                 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
               SINT.getOperand(0).getValueType() == MVT::i32) {
      // The source is an extended i32: spill the narrow value and let the
      // word load perform the matching extension.
      SpillWordToStack(SINT.getOperand(0), RLI);
      Bits = EmitWordLoad(SINT.getOpcode() == ISD::ZERO_EXTEND
                              ? PPCISD::LFIWZX
                              : PPCISD::LFIWAX,
                          RLI);
      Chain = Bits.getValue(1);
    } else {
      Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
    }

    return ConvertAndRound(Bits);
  }

  assert(InVT == MVT::i32 &&
         "Unhandled INT_TO_FP type in custom expander!");
  // Since we only generate this in 64-bit mode, we can take advantage of
  // 64-bit registers.  In particular, sign extend the input value into the
  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
  // then lfd it and fcfid it.
  SDValue Ld;
  if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
    ReuseLoadInfo RLI;
    const bool ReusingLoad = canReuseLoadAddress(Src, MVT::i32, RLI, DAG);
    if (!ReusingLoad)
      SpillWordToStack(Src, RLI);

    Ld = EmitWordLoad(IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, RLI);
    Chain = Ld.getValue(1);
    if (ReusingLoad)
      spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
  } else {
    assert(Subtarget.isPPC64() &&
           "i32->FP without LFIWAX supported only on PPC64");

    int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

    SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Src);

    // STD the extended value into the stack slot.
    SDValue Store =
        DAG.getStore(Chain, dl, Ext64, FIdx,
                     MachinePointerInfo::getFixedStack(MF, FrameIdx));
    Chain = Store;

    // Load the value as a double.
    Ld = DAG.getLoad(MVT::f64, dl, Chain, FIdx,
                     MachinePointerInfo::getFixedStack(MF, FrameIdx));
    Chain = Ld.getValue(1);
  }

  // FCFID it and return it.
  return ConvertAndRound(Ld);
}
9027
9028
/// Lower GET_ROUNDING by reading the FPSCR with MFFS and remapping its
/// rounding-mode field to the encoding GET_ROUNDING expects.
///
/// Operand 0 of \p Op is the incoming chain. The result is a merge of the
/// rounding-mode value (in the node's value type) and the outgoing chain.
SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  /*
   The rounding mode is in bits 30:31 of FPSCR, and has the following
   settings:
     00 Round to nearest
     01 Round to 0
     10 Round to +inf
     11 Round to -inf

  GET_ROUNDING, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
  */

  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = Op.getValueType();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Save FP Control Word to register
  SDValue Chain = Op.getOperand(0);
  SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
  Chain = MFFS.getValue(1);

  SDValue CWD;
  if (isTypeLegal(MVT::i64)) {
    // i64 is legal: reinterpret the f64 as i64 and keep the low 32 bits.
    CWD = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                      DAG.getNode(ISD::BITCAST, dl, MVT::i64, MFFS));
  } else {
    // Save FP register to stack slot
    int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());

    // Load FP Control Word from low 32 bits of stack slot.
    assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&
           "Stack slot adjustment is valid only on big endian subtargets!");
    SDValue Four = DAG.getConstant(4, dl, PtrVT);
    SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
    CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
    Chain = CWD.getValue(1);
  }

  // Transform as necessary
  // CWD1 = FPSCR & 0x3
  SDValue CWD1 =
    DAG.getNode(ISD::AND, dl, MVT::i32,
                CWD, DAG.getConstant(3, dl, MVT::i32));
  // CWD2 = ((FPSCR ^ 0x3) & 0x3) >> 1, i.e. (~FPSCR & 0x3) >> 1
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i32,
                DAG.getNode(ISD::AND, dl, MVT::i32,
                            DAG.getNode(ISD::XOR, dl, MVT::i32,
                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
                            DAG.getConstant(3, dl, MVT::i32)),
                DAG.getConstant(1, dl, MVT::i32));

  SDValue RetVal =
    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);

  // Resize the i32 result to the requested type. The previous code chose
  // TRUNCATE only below 16 bits, so a 16-bit result would have requested a
  // ZERO_EXTEND from i32 to a narrower type; getZExtOrTrunc picks the right
  // operation for every width and is a no-op for i32.
  RetVal = DAG.getZExtOrTrunc(RetVal, dl, VT);

  return DAG.getMergeValues({RetVal, Chain}, dl);
}
9099
9100
/// Lower SHL_PARTS: shift the double-width value (Lo = operand 0, Hi =
/// operand 1) left by operand 2 and return the {Lo, Hi} halves of the result.
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  const unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SHL!");

  // Expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  // BitWidth - Amt: how far Lo must move right to reach Hi's low bits.
  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT,
                               DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue HiShifted = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
  SDValue LoCarry = DAG.getNode(PPCISD::SRL, dl, VT, Lo, InvAmt);
  SDValue HiSmall = DAG.getNode(ISD::OR, dl, VT, HiShifted, LoCarry);
  // Amt - BitWidth: the shift applied to Lo when Amt is at least BitWidth.
  SDValue OverAmt = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                                DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue HiLarge = DAG.getNode(PPCISD::SHL, dl, VT, Lo, OverAmt);
  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, HiSmall, HiLarge);
  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
  return DAG.getMergeValues({OutLo, OutHi}, dl);
}
9128
9129
/// Lower SRL_PARTS: logically shift the double-width value (Lo = operand 0,
/// Hi = operand 1) right by operand 2 and return the {Lo, Hi} halves of the
/// result.
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  const unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRL!");

  // Expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  // BitWidth - Amt: how far Hi must move left to reach Lo's high bits.
  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT,
                               DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue LoShifted = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue HiCarry = DAG.getNode(PPCISD::SHL, dl, VT, Hi, InvAmt);
  SDValue LoSmall = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiCarry);
  // Amt - BitWidth: the shift applied to Hi when Amt is at least BitWidth.
  SDValue OverAmt = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                                DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue LoLarge = DAG.getNode(PPCISD::SRL, dl, VT, Hi, OverAmt);
  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, LoSmall, LoLarge);
  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
  return DAG.getMergeValues({OutLo, OutHi}, dl);
}
9157
9158
/// Lower SRA_PARTS: arithmetically shift the double-width value (Lo =
/// operand 0, Hi = operand 1) right by operand 2 and return the {Lo, Hi}
/// halves of the result.
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  const unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRA!");

  // Expand into a bunch of logical ops, followed by a select_cc.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  // BitWidth - Amt: how far Hi must move left to reach Lo's high bits.
  SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT,
                               DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue LoShifted = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue HiCarry = DAG.getNode(PPCISD::SHL, dl, VT, Hi, InvAmt);
  SDValue LoSmall = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiCarry);
  // Amt - BitWidth: positive exactly when Amt exceeds BitWidth.
  SDValue OverAmt = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                                DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue LoLarge = DAG.getNode(PPCISD::SRA, dl, VT, Hi, OverAmt);
  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
  // Use the combined value while Amt - BitWidth <= 0, and the sign-filled
  // shift of Hi otherwise.
  SDValue OutLo = DAG.getSelectCC(dl, OverAmt, DAG.getConstant(0, dl, AmtVT),
                                  LoSmall, LoLarge, ISD::SETLE);
  return DAG.getMergeValues({OutLo, OutHi}, dl);
}
9186
9187
/// Lower FSHL / FSHR (operands X, Y and shift amount Z) into a PPC shift-left
/// of X, a PPC shift-right of Y, and an OR of the two.
SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  const unsigned BitWidth = VT.getSizeInBits();

  const bool IsFSHL = Op.getOpcode() == ISD::FSHL;
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  SDValue Z = Op.getOperand(2);
  EVT AmtVT = Z.getValueType();

  // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  // This is simpler than TargetLowering::expandFunnelShift because we can rely
  // on PowerPC shift by BW being well defined.
  SDValue ModZ = DAG.getNode(ISD::AND, dl, AmtVT, Z,
                             DAG.getConstant(BitWidth - 1, dl, AmtVT));
  SDValue SubZ = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), ModZ);

  // The two amounts swap roles between the left and right funnel shift.
  SDValue LeftAmt = ModZ;
  SDValue RightAmt = SubZ;
  if (!IsFSHL) {
    LeftAmt = SubZ;
    RightAmt = ModZ;
  }

  SDValue FromX = DAG.getNode(PPCISD::SHL, dl, VT, X, LeftAmt);
  SDValue FromY = DAG.getNode(PPCISD::SRL, dl, VT, Y, RightAmt);
  return DAG.getNode(ISD::OR, dl, VT, FromX, FromY);
}
9211
9212
//===----------------------------------------------------------------------===//
//  Vector related lowering.
//
9215
9216
/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
9217
/// element size of SplatSize. Cast the result to VT.
9218
static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
9219
SelectionDAG &DAG, const SDLoc &dl) {
9220
static const MVT VTys[] = { // canonical VT to use for each size.
9221
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
9222
};
9223
9224
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
9225
9226
// For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
9227
if (Val == ((1LLU << (SplatSize * 8)) - 1)) {
9228
SplatSize = 1;
9229
Val = 0xFF;
9230
}
9231
9232
EVT CanonicalVT = VTys[SplatSize-1];
9233
9234
// Build a canonical splat for this value.
9235
return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
9236
}
9237
9238
/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
9239
/// specified intrinsic ID.
9240
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
9241
const SDLoc &dl, EVT DestVT = MVT::Other) {
9242
if (DestVT == MVT::Other) DestVT = Op.getValueType();
9243
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
9244
DAG.getConstant(IID, dl, MVT::i32), Op);
9245
}
9246
9247
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
9248
/// specified intrinsic ID.
9249
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
9250
SelectionDAG &DAG, const SDLoc &dl,
9251
EVT DestVT = MVT::Other) {
9252
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
9253
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
9254
DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
9255
}
9256
9257
/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
9258
/// specified intrinsic ID.
9259
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
9260
SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
9261
EVT DestVT = MVT::Other) {
9262
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
9263
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
9264
DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
9265
}
9266
9267
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
9268
/// amount. The result has the specified value type.
9269
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
9270
SelectionDAG &DAG, const SDLoc &dl) {
9271
// Force LHS/RHS to be the right type.
9272
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
9273
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
9274
9275
int Ops[16];
9276
for (unsigned i = 0; i != 16; ++i)
9277
Ops[i] = i + Amt;
9278
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
9279
return DAG.getNode(ISD::BITCAST, dl, VT, T);
9280
}
9281
9282
/// Do we have an efficient pattern in a .td file for this node?
9283
///
9284
/// \param V - pointer to the BuildVectorSDNode being matched
9285
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
9286
///
9287
/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
9288
/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
9289
/// the opposite is true (expansion is beneficial) are:
9290
/// - The node builds a vector out of integers that are not 32 or 64-bits
9291
/// - The node builds a vector out of constants
9292
/// - The node is a "load-and-splat"
9293
/// In all other cases, we will choose to keep the BUILD_VECTOR.
9294
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
9295
bool HasDirectMove,
9296
bool HasP8Vector) {
9297
EVT VecVT = V->getValueType(0);
9298
bool RightType = VecVT == MVT::v2f64 ||
9299
(HasP8Vector && VecVT == MVT::v4f32) ||
9300
(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
9301
if (!RightType)
9302
return false;
9303
9304
bool IsSplat = true;
9305
bool IsLoad = false;
9306
SDValue Op0 = V->getOperand(0);
9307
9308
// This function is called in a block that confirms the node is not a constant
9309
// splat. So a constant BUILD_VECTOR here means the vector is built out of
9310
// different constants.
9311
if (V->isConstant())
9312
return false;
9313
for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
9314
if (V->getOperand(i).isUndef())
9315
return false;
9316
// We want to expand nodes that represent load-and-splat even if the
9317
// loaded value is a floating point truncation or conversion to int.
9318
if (V->getOperand(i).getOpcode() == ISD::LOAD ||
9319
(V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
9320
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
9321
(V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
9322
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
9323
(V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
9324
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
9325
IsLoad = true;
9326
// If the operands are different or the input is not a load and has more
9327
// uses than just this BV node, then it isn't a splat.
9328
if (V->getOperand(i) != Op0 ||
9329
(!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
9330
IsSplat = false;
9331
}
9332
return !(IsSplat && IsLoad);
9333
}
9334
9335
// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
9336
SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
9337
9338
SDLoc dl(Op);
9339
SDValue Op0 = Op->getOperand(0);
9340
9341
if (!Subtarget.isPPC64() || (Op0.getOpcode() != ISD::BUILD_PAIR) ||
9342
(Op.getValueType() != MVT::f128))
9343
return SDValue();
9344
9345
SDValue Lo = Op0.getOperand(0);
9346
SDValue Hi = Op0.getOperand(1);
9347
if ((Lo.getValueType() != MVT::i64) || (Hi.getValueType() != MVT::i64))
9348
return SDValue();
9349
9350
if (!Subtarget.isLittleEndian())
9351
std::swap(Lo, Hi);
9352
9353
return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Lo, Hi);
9354
}
9355
9356
/// Look through any chain of BITCASTs and at most one SCALAR_TO_VECTOR /
/// SCALAR_TO_VECTOR_PERMUTED on top of \p Op and return a pointer to the
/// underlying value if it is a normal load, or nullptr otherwise.
///
/// \param IsPermuted written only when a scalar-to-vector node is looked
///                   through: true for the PERMUTED flavour, false otherwise.
static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
  const SDValue *Cur = &Op;
  while (Cur->getOpcode() == ISD::BITCAST)
    Cur = &Cur->getOperand(0);

  const unsigned Opc = Cur->getOpcode();
  if (Opc == ISD::SCALAR_TO_VECTOR ||
      Opc == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
    IsPermuted = Opc == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
    Cur = &Cur->getOperand(0);
  }

  if (Cur->getOpcode() != ISD::LOAD)
    return nullptr;

  LoadSDNode *Load = cast<LoadSDNode>(*Cur);
  if (!ISD::isNormalLoad(Load))
    return nullptr;
  return Cur;
}
9370
9371
// Convert the argument APFloat to a single precision APFloat if there is no
9372
// loss in information during the conversion to single precision APFloat and the
9373
// resulting number is not a denormal number. Return true if successful.
9374
bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
9375
APFloat APFloatToConvert = ArgAPFloat;
9376
bool LosesInfo = true;
9377
APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
9378
&LosesInfo);
9379
bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
9380
if (Success)
9381
ArgAPFloat = APFloatToConvert;
9382
return Success;
9383
}
9384
9385
// Bitcast the argument APInt to a double and convert it to a single precision
9386
// APFloat, bitcast the APFloat to an APInt and assign it to the original
9387
// argument if there is no loss in information during the conversion from
9388
// double to single precision APFloat and the resulting number is not a denormal
9389
// number. Return true if successful.
9390
bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
9391
double DpValue = ArgAPInt.bitsToDouble();
9392
APFloat APFloatDp(DpValue);
9393
bool Success = convertToNonDenormSingle(APFloatDp);
9394
if (Success)
9395
ArgAPInt = APFloatDp.bitcastToAPInt();
9396
return Success;
9397
}
9398
9399
// Nondestructive check for convertTonNonDenormSingle.
9400
bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
9401
// Only convert if it loses info, since XXSPLTIDP should
9402
// handle the other case.
9403
APFloat APFloatToConvert = ArgAPFloat;
9404
bool LosesInfo = true;
9405
APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
9406
&LosesInfo);
9407
9408
return (!LosesInfo && !APFloatToConvert.isDenormal());
9409
}
9410
9411
static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
9412
unsigned &Opcode) {
9413
LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0));
9414
if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode))
9415
return false;
9416
9417
EVT Ty = Op->getValueType(0);
9418
// For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
9419
// as we cannot handle extending loads for these types.
9420
if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
9421
ISD::isNON_EXTLoad(InputNode))
9422
return true;
9423
9424
EVT MemVT = InputNode->getMemoryVT();
9425
// For v8i16 and v16i8 types, extending loads can be handled as long as the
9426
// memory VT is the same vector element VT type.
9427
// The loads feeding into the v8i16 and v16i8 types will be extending because
9428
// scalar i8/i16 are not legal types.
9429
if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) &&
9430
(MemVT == Ty.getVectorElementType()))
9431
return true;
9432
9433
if (Ty == MVT::v2i64) {
9434
// Check the extend type, when the input type is i32, and the output vector
9435
// type is v2i64.
9436
if (MemVT == MVT::i32) {
9437
if (ISD::isZEXTLoad(InputNode))
9438
Opcode = PPCISD::ZEXT_LD_SPLAT;
9439
if (ISD::isSEXTLoad(InputNode))
9440
Opcode = PPCISD::SEXT_LD_SPLAT;
9441
}
9442
return true;
9443
}
9444
return false;
9445
}
9446
9447
// If this is a case we can't handle, return null and let the default
9448
// expansion code take care of it. If we CAN select this case, and if it
9449
// selects to a single instruction, return Op. Otherwise, if we can codegen
9450
// this case more efficiently than a constant pool load, lower it to the
9451
// sequence of ops that should be used.
9452
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
9453
SelectionDAG &DAG) const {
9454
SDLoc dl(Op);
9455
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
9456
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
9457
9458
// Check if this is a splat of a constant value.
9459
APInt APSplatBits, APSplatUndef;
9460
unsigned SplatBitSize;
9461
bool HasAnyUndefs;
9462
bool BVNIsConstantSplat =
9463
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
9464
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
9465
9466
// If it is a splat of a double, check if we can shrink it to a 32 bit
9467
// non-denormal float which when converted back to double gives us the same
9468
// double. This is to exploit the XXSPLTIDP instruction.
9469
// If we lose precision, we use XXSPLTI32DX.
9470
if (BVNIsConstantSplat && (SplatBitSize == 64) &&
9471
Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
9472
// Check the type first to short-circuit so we don't modify APSplatBits if
9473
// this block isn't executed.
9474
if ((Op->getValueType(0) == MVT::v2f64) &&
9475
convertToNonDenormSingle(APSplatBits)) {
9476
SDValue SplatNode = DAG.getNode(
9477
PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
9478
DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
9479
return DAG.getBitcast(Op.getValueType(), SplatNode);
9480
} else {
9481
// We may lose precision, so we have to use XXSPLTI32DX.
9482
9483
uint32_t Hi =
9484
(uint32_t)((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32);
9485
uint32_t Lo =
9486
(uint32_t)(APSplatBits.getZExtValue() & 0xFFFFFFFF);
9487
SDValue SplatNode = DAG.getUNDEF(MVT::v2i64);
9488
9489
if (!Hi || !Lo)
9490
// If either load is 0, then we should generate XXLXOR to set to 0.
9491
SplatNode = DAG.getTargetConstant(0, dl, MVT::v2i64);
9492
9493
if (Hi)
9494
SplatNode = DAG.getNode(
9495
PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,
9496
DAG.getTargetConstant(0, dl, MVT::i32),
9497
DAG.getTargetConstant(Hi, dl, MVT::i32));
9498
9499
if (Lo)
9500
SplatNode =
9501
DAG.getNode(PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,
9502
DAG.getTargetConstant(1, dl, MVT::i32),
9503
DAG.getTargetConstant(Lo, dl, MVT::i32));
9504
9505
return DAG.getBitcast(Op.getValueType(), SplatNode);
9506
}
9507
}
9508
9509
if (!BVNIsConstantSplat || SplatBitSize > 32) {
9510
unsigned NewOpcode = PPCISD::LD_SPLAT;
9511
9512
// Handle load-and-splat patterns as we have instructions that will do this
9513
// in one go.
9514
if (DAG.isSplatValue(Op, true) &&
9515
isValidSplatLoad(Subtarget, Op, NewOpcode)) {
9516
const SDValue *InputLoad = &Op.getOperand(0);
9517
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
9518
9519
// If the input load is an extending load, it will be an i32 -> i64
9520
// extending load and isValidSplatLoad() will update NewOpcode.
9521
unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();
9522
unsigned ElementSize =
9523
MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
9524
9525
assert(((ElementSize == 2 * MemorySize)
9526
? (NewOpcode == PPCISD::ZEXT_LD_SPLAT ||
9527
NewOpcode == PPCISD::SEXT_LD_SPLAT)
9528
: (NewOpcode == PPCISD::LD_SPLAT)) &&
9529
"Unmatched element size and opcode!\n");
9530
9531
// Checking for a single use of this load, we have to check for vector
9532
// width (128 bits) / ElementSize uses (since each operand of the
9533
// BUILD_VECTOR is a separate use of the value.
9534
unsigned NumUsesOfInputLD = 128 / ElementSize;
9535
for (SDValue BVInOp : Op->ops())
9536
if (BVInOp.isUndef())
9537
NumUsesOfInputLD--;
9538
9539
// Exclude somes case where LD_SPLAT is worse than scalar_to_vector:
9540
// Below cases should also happen for "lfiwzx/lfiwax + LE target + index
9541
// 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index
9542
// 15", but function IsValidSplatLoad() now will only return true when
9543
// the data at index 0 is not nullptr. So we will not get into trouble for
9544
// these cases.
9545
//
9546
// case 1 - lfiwzx/lfiwax
9547
// 1.1: load result is i32 and is sign/zero extend to i64;
9548
// 1.2: build a v2i64 vector type with above loaded value;
9549
// 1.3: the vector has only one value at index 0, others are all undef;
9550
// 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
9551
if (NumUsesOfInputLD == 1 &&
9552
(Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
9553
!Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
9554
Subtarget.hasLFIWAX()))
9555
return SDValue();
9556
9557
// case 2 - lxvr[hb]x
9558
// 2.1: load result is at most i16;
9559
// 2.2: build a vector with above loaded value;
9560
// 2.3: the vector has only one value at index 0, others are all undef;
9561
// 2.4: on LE target, so that lxvr[hb]x does not need any permute.
9562
if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
9563
Subtarget.isISA3_1() && ElementSize <= 16)
9564
return SDValue();
9565
9566
assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
9567
if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
9568
Subtarget.hasVSX()) {
9569
SDValue Ops[] = {
9570
LD->getChain(), // Chain
9571
LD->getBasePtr(), // Ptr
9572
DAG.getValueType(Op.getValueType()) // VT
9573
};
9574
SDValue LdSplt = DAG.getMemIntrinsicNode(
9575
NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,
9576
LD->getMemoryVT(), LD->getMemOperand());
9577
// Replace all uses of the output chain of the original load with the
9578
// output chain of the new load.
9579
DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
9580
LdSplt.getValue(1));
9581
return LdSplt;
9582
}
9583
}
9584
9585
// In 64BIT mode BUILD_VECTOR nodes that are not constant splats of up to
9586
// 32-bits can be lowered to VSX instructions under certain conditions.
9587
// Without VSX, there is no pattern more efficient than expanding the node.
9588
if (Subtarget.hasVSX() && Subtarget.isPPC64() &&
9589
haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
9590
Subtarget.hasP8Vector()))
9591
return Op;
9592
return SDValue();
9593
}
9594
9595
uint64_t SplatBits = APSplatBits.getZExtValue();
9596
uint64_t SplatUndef = APSplatUndef.getZExtValue();
9597
unsigned SplatSize = SplatBitSize / 8;
9598
9599
// First, handle single instruction cases.
9600
9601
// All zeros?
9602
if (SplatBits == 0) {
9603
// Canonicalize all zero vectors to be v4i32.
9604
if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
9605
SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
9606
Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
9607
}
9608
return Op;
9609
}
9610
9611
// We have XXSPLTIW for constant splats four bytes wide.
9612
// Given vector length is a multiple of 4, 2-byte splats can be replaced
9613
// with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
9614
// make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
9615
// turned into a 4-byte splat of 0xABABABAB.
9616
if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)
9617
return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,
9618
Op.getValueType(), DAG, dl);
9619
9620
if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)
9621
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
9622
dl);
9623
9624
// We have XXSPLTIB for constant splats one byte wide.
9625
if (Subtarget.hasP9Vector() && SplatSize == 1)
9626
return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
9627
dl);
9628
9629
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
9630
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
9631
(32-SplatBitSize));
9632
if (SextVal >= -16 && SextVal <= 15)
9633
return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
9634
dl);
9635
9636
// Two instruction sequences.
9637
9638
// If this value is in the range [-32,30] and is even, use:
9639
// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
9640
// If this value is in the range [17,31] and is odd, use:
9641
// VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
9642
// If this value is in the range [-31,-17] and is odd, use:
9643
// VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
9644
// Note the last two are three-instruction sequences.
9645
if (SextVal >= -32 && SextVal <= 31) {
9646
// To avoid having these optimizations undone by constant folding,
9647
// we convert to a pseudo that will be expanded later into one of
9648
// the above forms.
9649
SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
9650
EVT VT = (SplatSize == 1 ? MVT::v16i8 :
9651
(SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
9652
SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
9653
SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
9654
if (VT == Op.getValueType())
9655
return RetVal;
9656
else
9657
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
9658
}
9659
9660
// If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
9661
// 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
9662
// for fneg/fabs.
9663
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
9664
// Make -1 and vspltisw -1:
9665
SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);
9666
9667
// Make the VSLW intrinsic, computing 0x8000_0000.
9668
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
9669
OnesV, DAG, dl);
9670
9671
// xor by OnesV to invert it.
9672
Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
9673
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
9674
}
9675
9676
// Check to see if this is a wide variety of vsplti*, binop self cases.
9677
static const signed char SplatCsts[] = {
9678
-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
9679
-8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
9680
};
9681
9682
for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) {
9683
// Indirect through the SplatCsts array so that we favor 'vsplti -1' for
9684
// cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1'
9685
int i = SplatCsts[idx];
9686
9687
// Figure out what shift amount will be used by altivec if shifted by i in
9688
// this splat size.
9689
unsigned TypeShiftAmt = i & (SplatBitSize-1);
9690
9691
// vsplti + shl self.
9692
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
9693
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
9694
static const unsigned IIDs[] = { // Intrinsic to use for each size.
9695
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
9696
Intrinsic::ppc_altivec_vslw
9697
};
9698
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
9699
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
9700
}
9701
9702
// vsplti + srl self.
9703
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
9704
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
9705
static const unsigned IIDs[] = { // Intrinsic to use for each size.
9706
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
9707
Intrinsic::ppc_altivec_vsrw
9708
};
9709
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
9710
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
9711
}
9712
9713
// vsplti + rol self.
9714
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
9715
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
9716
SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
9717
static const unsigned IIDs[] = { // Intrinsic to use for each size.
9718
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
9719
Intrinsic::ppc_altivec_vrlw
9720
};
9721
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
9722
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
9723
}
9724
9725
// t = vsplti c, result = vsldoi t, t, 1
9726
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
9727
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
9728
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
9729
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
9730
}
9731
// t = vsplti c, result = vsldoi t, t, 2
9732
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
9733
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
9734
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
9735
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
9736
}
9737
// t = vsplti c, result = vsldoi t, t, 3
9738
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
9739
SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
9740
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
9741
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
9742
}
9743
}
9744
9745
return SDValue();
9746
}
9747
9748
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9749
/// the specified operations to build the shuffle.
9750
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
9751
SDValue RHS, SelectionDAG &DAG,
9752
const SDLoc &dl) {
9753
unsigned OpNum = (PFEntry >> 26) & 0x0F;
9754
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
9755
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
9756
9757
enum {
9758
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
9759
OP_VMRGHW,
9760
OP_VMRGLW,
9761
OP_VSPLTISW0,
9762
OP_VSPLTISW1,
9763
OP_VSPLTISW2,
9764
OP_VSPLTISW3,
9765
OP_VSLDOI4,
9766
OP_VSLDOI8,
9767
OP_VSLDOI12
9768
};
9769
9770
if (OpNum == OP_COPY) {
9771
if (LHSID == (1*9+2)*9+3) return LHS;
9772
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
9773
return RHS;
9774
}
9775
9776
SDValue OpLHS, OpRHS;
9777
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
9778
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
9779
9780
int ShufIdxs[16];
9781
switch (OpNum) {
9782
default: llvm_unreachable("Unknown i32 permute!");
9783
case OP_VMRGHW:
9784
ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
9785
ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
9786
ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
9787
ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
9788
break;
9789
case OP_VMRGLW:
9790
ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
9791
ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
9792
ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
9793
ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
9794
break;
9795
case OP_VSPLTISW0:
9796
for (unsigned i = 0; i != 16; ++i)
9797
ShufIdxs[i] = (i&3)+0;
9798
break;
9799
case OP_VSPLTISW1:
9800
for (unsigned i = 0; i != 16; ++i)
9801
ShufIdxs[i] = (i&3)+4;
9802
break;
9803
case OP_VSPLTISW2:
9804
for (unsigned i = 0; i != 16; ++i)
9805
ShufIdxs[i] = (i&3)+8;
9806
break;
9807
case OP_VSPLTISW3:
9808
for (unsigned i = 0; i != 16; ++i)
9809
ShufIdxs[i] = (i&3)+12;
9810
break;
9811
case OP_VSLDOI4:
9812
return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
9813
case OP_VSLDOI8:
9814
return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
9815
case OP_VSLDOI12:
9816
return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
9817
}
9818
EVT VT = OpLHS.getValueType();
9819
OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
9820
OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
9821
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
9822
return DAG.getNode(ISD::BITCAST, dl, VT, T);
9823
}
9824
9825
/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
9826
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
9827
/// SDValue.
9828
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
9829
SelectionDAG &DAG) const {
9830
const unsigned BytesInVector = 16;
9831
bool IsLE = Subtarget.isLittleEndian();
9832
SDLoc dl(N);
9833
SDValue V1 = N->getOperand(0);
9834
SDValue V2 = N->getOperand(1);
9835
unsigned ShiftElts = 0, InsertAtByte = 0;
9836
bool Swap = false;
9837
9838
// Shifts required to get the byte we want at element 7.
9839
unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
9840
0, 15, 14, 13, 12, 11, 10, 9};
9841
unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
9842
1, 2, 3, 4, 5, 6, 7, 8};
9843
9844
ArrayRef<int> Mask = N->getMask();
9845
int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
9846
9847
// For each mask element, find out if we're just inserting something
9848
// from V2 into V1 or vice versa.
9849
// Possible permutations inserting an element from V2 into V1:
9850
// X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
9851
// 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
9852
// ...
9853
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
9854
// Inserting from V1 into V2 will be similar, except mask range will be
9855
// [16,31].
9856
9857
bool FoundCandidate = false;
9858
// If both vector operands for the shuffle are the same vector, the mask
9859
// will contain only elements from the first one and the second one will be
9860
// undef.
9861
unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
9862
// Go through the mask of half-words to find an element that's being moved
9863
// from one vector to the other.
9864
for (unsigned i = 0; i < BytesInVector; ++i) {
9865
unsigned CurrentElement = Mask[i];
9866
// If 2nd operand is undefined, we should only look for element 7 in the
9867
// Mask.
9868
if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
9869
continue;
9870
9871
bool OtherElementsInOrder = true;
9872
// Examine the other elements in the Mask to see if they're in original
9873
// order.
9874
for (unsigned j = 0; j < BytesInVector; ++j) {
9875
if (j == i)
9876
continue;
9877
// If CurrentElement is from V1 [0,15], then we the rest of the Mask to be
9878
// from V2 [16,31] and vice versa. Unless the 2nd operand is undefined,
9879
// in which we always assume we're always picking from the 1st operand.
9880
int MaskOffset =
9881
(!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
9882
if (Mask[j] != OriginalOrder[j] + MaskOffset) {
9883
OtherElementsInOrder = false;
9884
break;
9885
}
9886
}
9887
// If other elements are in original order, we record the number of shifts
9888
// we need to get the element we want into element 7. Also record which byte
9889
// in the vector we should insert into.
9890
if (OtherElementsInOrder) {
9891
// If 2nd operand is undefined, we assume no shifts and no swapping.
9892
if (V2.isUndef()) {
9893
ShiftElts = 0;
9894
Swap = false;
9895
} else {
9896
// Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4.
9897
ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
9898
: BigEndianShifts[CurrentElement & 0xF];
9899
Swap = CurrentElement < BytesInVector;
9900
}
9901
InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
9902
FoundCandidate = true;
9903
break;
9904
}
9905
}
9906
9907
if (!FoundCandidate)
9908
return SDValue();
9909
9910
// Candidate found, construct the proper SDAG sequence with VINSERTB,
9911
// optionally with VECSHL if shift is required.
9912
if (Swap)
9913
std::swap(V1, V2);
9914
if (V2.isUndef())
9915
V2 = V1;
9916
if (ShiftElts) {
9917
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
9918
DAG.getConstant(ShiftElts, dl, MVT::i32));
9919
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
9920
DAG.getConstant(InsertAtByte, dl, MVT::i32));
9921
}
9922
return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
9923
DAG.getConstant(InsertAtByte, dl, MVT::i32));
9924
}
9925
9926
/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
9927
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
9928
/// SDValue.
9929
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
9930
SelectionDAG &DAG) const {
9931
const unsigned NumHalfWords = 8;
9932
const unsigned BytesInVector = NumHalfWords * 2;
9933
// Check that the shuffle is on half-words.
9934
if (!isNByteElemShuffleMask(N, 2, 1))
9935
return SDValue();
9936
9937
bool IsLE = Subtarget.isLittleEndian();
9938
SDLoc dl(N);
9939
SDValue V1 = N->getOperand(0);
9940
SDValue V2 = N->getOperand(1);
9941
unsigned ShiftElts = 0, InsertAtByte = 0;
9942
bool Swap = false;
9943
9944
// Shifts required to get the half-word we want at element 3.
9945
unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
9946
unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
9947
9948
uint32_t Mask = 0;
9949
uint32_t OriginalOrderLow = 0x1234567;
9950
uint32_t OriginalOrderHigh = 0x89ABCDEF;
9951
// Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a
9952
// 32-bit space, only need 4-bit nibbles per element.
9953
for (unsigned i = 0; i < NumHalfWords; ++i) {
9954
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
9955
Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
9956
}
9957
9958
// For each mask element, find out if we're just inserting something
9959
// from V2 into V1 or vice versa. Possible permutations inserting an element
9960
// from V2 into V1:
9961
// X, 1, 2, 3, 4, 5, 6, 7
9962
// 0, X, 2, 3, 4, 5, 6, 7
9963
// 0, 1, X, 3, 4, 5, 6, 7
9964
// 0, 1, 2, X, 4, 5, 6, 7
9965
// 0, 1, 2, 3, X, 5, 6, 7
9966
// 0, 1, 2, 3, 4, X, 6, 7
9967
// 0, 1, 2, 3, 4, 5, X, 7
9968
// 0, 1, 2, 3, 4, 5, 6, X
9969
// Inserting from V1 into V2 will be similar, except mask range will be [8,15].
9970
9971
bool FoundCandidate = false;
9972
// Go through the mask of half-words to find an element that's being moved
9973
// from one vector to the other.
9974
for (unsigned i = 0; i < NumHalfWords; ++i) {
9975
unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
9976
uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
9977
uint32_t MaskOtherElts = ~(0xF << MaskShift);
9978
uint32_t TargetOrder = 0x0;
9979
9980
// If both vector operands for the shuffle are the same vector, the mask
9981
// will contain only elements from the first one and the second one will be
9982
// undef.
9983
if (V2.isUndef()) {
9984
ShiftElts = 0;
9985
unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
9986
TargetOrder = OriginalOrderLow;
9987
Swap = false;
9988
// Skip if not the correct element or mask of other elements don't equal
9989
// to our expected order.
9990
if (MaskOneElt == VINSERTHSrcElem &&
9991
(Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
9992
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
9993
FoundCandidate = true;
9994
break;
9995
}
9996
} else { // If both operands are defined.
9997
// Target order is [8,15] if the current mask is between [0,7].
9998
TargetOrder =
9999
(MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
10000
// Skip if mask of other elements don't equal our expected order.
10001
if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
10002
// We only need the last 3 bits for the number of shifts.
10003
ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
10004
: BigEndianShifts[MaskOneElt & 0x7];
10005
InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
10006
Swap = MaskOneElt < NumHalfWords;
10007
FoundCandidate = true;
10008
break;
10009
}
10010
}
10011
}
10012
10013
if (!FoundCandidate)
10014
return SDValue();
10015
10016
// Candidate found, construct the proper SDAG sequence with VINSERTH,
10017
// optionally with VECSHL if shift is required.
10018
if (Swap)
10019
std::swap(V1, V2);
10020
if (V2.isUndef())
10021
V2 = V1;
10022
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
10023
if (ShiftElts) {
10024
// Double ShiftElts because we're left shifting on v16i8 type.
10025
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
10026
DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
10027
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
10028
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
10029
DAG.getConstant(InsertAtByte, dl, MVT::i32));
10030
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
10031
}
10032
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
10033
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
10034
DAG.getConstant(InsertAtByte, dl, MVT::i32));
10035
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
10036
}
10037
10038
/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
10039
/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
10040
/// return the default SDValue.
10041
SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
10042
SelectionDAG &DAG) const {
10043
// The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
10044
// to v16i8. Peek through the bitcasts to get the actual operands.
10045
SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
10046
SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
10047
10048
auto ShuffleMask = SVN->getMask();
10049
SDValue VecShuffle(SVN, 0);
10050
SDLoc DL(SVN);
10051
10052
// Check that we have a four byte shuffle.
10053
if (!isNByteElemShuffleMask(SVN, 4, 1))
10054
return SDValue();
10055
10056
// Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
10057
if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
10058
std::swap(LHS, RHS);
10059
VecShuffle = peekThroughBitcasts(DAG.getCommutedVectorShuffle(*SVN));
10060
ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(VecShuffle);
10061
if (!CommutedSV)
10062
return SDValue();
10063
ShuffleMask = CommutedSV->getMask();
10064
}
10065
10066
// Ensure that the RHS is a vector of constants.
10067
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
10068
if (!BVN)
10069
return SDValue();
10070
10071
// Check if RHS is a splat of 4-bytes (or smaller).
10072
APInt APSplatValue, APSplatUndef;
10073
unsigned SplatBitSize;
10074
bool HasAnyUndefs;
10075
if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
10076
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
10077
SplatBitSize > 32)
10078
return SDValue();
10079
10080
// Check that the shuffle mask matches the semantics of XXSPLTI32DX.
10081
// The instruction splats a constant C into two words of the source vector
10082
// producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
10083
// Thus we check that the shuffle mask is the equivalent of
10084
// <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
10085
// Note: the check above of isNByteElemShuffleMask() ensures that the bytes
10086
// within each word are consecutive, so we only need to check the first byte.
10087
SDValue Index;
10088
bool IsLE = Subtarget.isLittleEndian();
10089
if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
10090
(ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
10091
ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
10092
Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
10093
else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
10094
(ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
10095
ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
10096
Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
10097
else
10098
return SDValue();
10099
10100
// If the splat is narrower than 32-bits, we need to get the 32-bit value
10101
// for XXSPLTI32DX.
10102
unsigned SplatVal = APSplatValue.getZExtValue();
10103
for (; SplatBitSize < 32; SplatBitSize <<= 1)
10104
SplatVal |= (SplatVal << SplatBitSize);
10105
10106
SDValue SplatNode = DAG.getNode(
10107
PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
10108
Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
10109
return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
10110
}
10111
10112
/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
10113
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
10114
/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
10115
/// i.e (or (shl x, C1), (srl x, 128-C1)).
10116
SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
10117
assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
10118
assert(Op.getValueType() == MVT::v1i128 &&
10119
"Only set v1i128 as custom, other type shouldn't reach here!");
10120
SDLoc dl(Op);
10121
SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
10122
SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
10123
unsigned SHLAmt = N1.getConstantOperandVal(0);
10124
if (SHLAmt % 8 == 0) {
10125
std::array<int, 16> Mask;
10126
std::iota(Mask.begin(), Mask.end(), 0);
10127
std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
10128
if (SDValue Shuffle =
10129
DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
10130
DAG.getUNDEF(MVT::v16i8), Mask))
10131
return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
10132
}
10133
SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
10134
SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
10135
DAG.getConstant(SHLAmt, dl, MVT::i32));
10136
SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
10137
DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
10138
SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
10139
return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
10140
}
10141
10142
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
10143
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
10144
/// return the code it can be lowered into. Worst case, it can always be
10145
/// lowered into a vperm.
10146
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
10147
SelectionDAG &DAG) const {
10148
SDLoc dl(Op);
10149
SDValue V1 = Op.getOperand(0);
10150
SDValue V2 = Op.getOperand(1);
10151
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10152
10153
// Any nodes that were combined in the target-independent combiner prior
10154
// to vector legalization will not be sent to the target combine. Try to
10155
// combine it here.
10156
if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
10157
if (!isa<ShuffleVectorSDNode>(NewShuffle))
10158
return NewShuffle;
10159
Op = NewShuffle;
10160
SVOp = cast<ShuffleVectorSDNode>(Op);
10161
V1 = Op.getOperand(0);
10162
V2 = Op.getOperand(1);
10163
}
10164
EVT VT = Op.getValueType();
10165
bool isLittleEndian = Subtarget.isLittleEndian();
10166
10167
unsigned ShiftElts, InsertAtByte;
10168
bool Swap = false;
10169
10170
// If this is a load-and-splat, we can do that with a single instruction
10171
// in some cases. However if the load has multiple uses, we don't want to
10172
// combine it because that will just produce multiple loads.
10173
bool IsPermutedLoad = false;
10174
const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
10175
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
10176
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
10177
InputLoad->hasOneUse()) {
10178
bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
10179
int SplatIdx =
10180
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
10181
10182
// The splat index for permuted loads will be in the left half of the vector
10183
// which is strictly wider than the loaded value by 8 bytes. So we need to
10184
// adjust the splat index to point to the correct address in memory.
10185
if (IsPermutedLoad) {
10186
assert((isLittleEndian || IsFourByte) &&
10187
"Unexpected size for permuted load on big endian target");
10188
SplatIdx += IsFourByte ? 2 : 1;
10189
assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
10190
"Splat of a value outside of the loaded memory");
10191
}
10192
10193
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
10194
// For 4-byte load-and-splat, we need Power9.
10195
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
10196
uint64_t Offset = 0;
10197
if (IsFourByte)
10198
Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
10199
else
10200
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
10201
10202
// If the width of the load is the same as the width of the splat,
10203
// loading with an offset would load the wrong memory.
10204
if (LD->getValueType(0).getSizeInBits() == (IsFourByte ? 32 : 64))
10205
Offset = 0;
10206
10207
SDValue BasePtr = LD->getBasePtr();
10208
if (Offset != 0)
10209
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
10210
BasePtr, DAG.getIntPtrConstant(Offset, dl));
10211
SDValue Ops[] = {
10212
LD->getChain(), // Chain
10213
BasePtr, // BasePtr
10214
DAG.getValueType(Op.getValueType()) // VT
10215
};
10216
SDVTList VTL =
10217
DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
10218
SDValue LdSplt =
10219
DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
10220
Ops, LD->getMemoryVT(), LD->getMemOperand());
10221
DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1), LdSplt.getValue(1));
10222
if (LdSplt.getValueType() != SVOp->getValueType(0))
10223
LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
10224
return LdSplt;
10225
}
10226
}
10227
10228
// All v2i64 and v2f64 shuffles are legal
10229
if (VT == MVT::v2i64 || VT == MVT::v2f64)
10230
return Op;
10231
10232
if (Subtarget.hasP9Vector() &&
10233
PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
10234
isLittleEndian)) {
10235
if (V2.isUndef())
10236
V2 = V1;
10237
else if (Swap)
10238
std::swap(V1, V2);
10239
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
10240
SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
10241
if (ShiftElts) {
10242
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
10243
DAG.getConstant(ShiftElts, dl, MVT::i32));
10244
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
10245
DAG.getConstant(InsertAtByte, dl, MVT::i32));
10246
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
10247
}
10248
SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
10249
DAG.getConstant(InsertAtByte, dl, MVT::i32));
10250
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
10251
}
10252
10253
if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
10254
SDValue SplatInsertNode;
10255
if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
10256
return SplatInsertNode;
10257
}
10258
10259
if (Subtarget.hasP9Altivec()) {
10260
SDValue NewISDNode;
10261
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
10262
return NewISDNode;
10263
10264
if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
10265
return NewISDNode;
10266
}
10267
10268
if (Subtarget.hasVSX() &&
10269
PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
10270
if (Swap)
10271
std::swap(V1, V2);
10272
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
10273
SDValue Conv2 =
10274
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);
10275
10276
SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
10277
DAG.getConstant(ShiftElts, dl, MVT::i32));
10278
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
10279
}
10280
10281
if (Subtarget.hasVSX() &&
10282
PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
10283
if (Swap)
10284
std::swap(V1, V2);
10285
SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
10286
SDValue Conv2 =
10287
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
10288
10289
SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
10290
DAG.getConstant(ShiftElts, dl, MVT::i32));
10291
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
10292
}
10293
10294
if (Subtarget.hasP9Vector()) {
10295
if (PPC::isXXBRHShuffleMask(SVOp)) {
10296
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
10297
SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);
10298
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
10299
} else if (PPC::isXXBRWShuffleMask(SVOp)) {
10300
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
10301
SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);
10302
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
10303
} else if (PPC::isXXBRDShuffleMask(SVOp)) {
10304
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
10305
SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);
10306
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
10307
} else if (PPC::isXXBRQShuffleMask(SVOp)) {
10308
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
10309
SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);
10310
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
10311
}
10312
}
10313
10314
if (Subtarget.hasVSX()) {
10315
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
10316
int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
10317
10318
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
10319
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
10320
DAG.getConstant(SplatIdx, dl, MVT::i32));
10321
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
10322
}
10323
10324
// Left shifts of 8 bytes are actually swaps. Convert accordingly.
10325
if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
10326
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
10327
SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
10328
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
10329
}
10330
}
10331
10332
// Cases that are handled by instructions that take permute immediates
10333
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
10334
// selected by the instruction selector.
10335
if (V2.isUndef()) {
10336
if (PPC::isSplatShuffleMask(SVOp, 1) ||
10337
PPC::isSplatShuffleMask(SVOp, 2) ||
10338
PPC::isSplatShuffleMask(SVOp, 4) ||
10339
PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
10340
PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
10341
PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
10342
PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
10343
PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
10344
PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
10345
PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
10346
PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
10347
PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
10348
(Subtarget.hasP8Altivec() && (
10349
PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
10350
PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
10351
PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
10352
return Op;
10353
}
10354
}
10355
10356
// Altivec has a variety of "shuffle immediates" that take two vector inputs
10357
// and produce a fixed permutation. If any of these match, do not lower to
10358
// VPERM.
10359
unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
10360
if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
10361
PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
10362
PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
10363
PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
10364
PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
10365
PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
10366
PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
10367
PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
10368
PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
10369
(Subtarget.hasP8Altivec() && (
10370
PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
10371
PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
10372
PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
10373
return Op;
10374
10375
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
10376
// perfect shuffle table to emit an optimal matching sequence.
10377
ArrayRef<int> PermMask = SVOp->getMask();
10378
10379
if (!DisablePerfectShuffle && !isLittleEndian) {
10380
unsigned PFIndexes[4];
10381
bool isFourElementShuffle = true;
10382
for (unsigned i = 0; i != 4 && isFourElementShuffle;
10383
++i) { // Element number
10384
unsigned EltNo = 8; // Start out undef.
10385
for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
10386
if (PermMask[i * 4 + j] < 0)
10387
continue; // Undef, ignore it.
10388
10389
unsigned ByteSource = PermMask[i * 4 + j];
10390
if ((ByteSource & 3) != j) {
10391
isFourElementShuffle = false;
10392
break;
10393
}
10394
10395
if (EltNo == 8) {
10396
EltNo = ByteSource / 4;
10397
} else if (EltNo != ByteSource / 4) {
10398
isFourElementShuffle = false;
10399
break;
10400
}
10401
}
10402
PFIndexes[i] = EltNo;
10403
}
10404
10405
// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
10406
// perfect shuffle vector to determine if it is cost effective to do this as
10407
// discrete instructions, or whether we should use a vperm.
10408
// For now, we skip this for little endian until such time as we have a
10409
// little-endian perfect shuffle table.
10410
if (isFourElementShuffle) {
10411
// Compute the index in the perfect shuffle table.
10412
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10413
PFIndexes[2] * 9 + PFIndexes[3];
10414
10415
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10416
unsigned Cost = (PFEntry >> 30);
10417
10418
// Determining when to avoid vperm is tricky. Many things affect the cost
10419
// of vperm, particularly how many times the perm mask needs to be
10420
// computed. For example, if the perm mask can be hoisted out of a loop or
10421
// is already used (perhaps because there are multiple permutes with the
10422
// same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the
10423
// permute mask out of the loop requires an extra register.
10424
//
10425
// As a compromise, we only emit discrete instructions if the shuffle can
10426
// be generated in 3 or fewer operations. When we have loop information
10427
// available, if this block is within a loop, we should avoid using vperm
10428
// for 3-operation perms and use a constant pool load instead.
10429
if (Cost < 3)
10430
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
10431
}
10432
}
10433
10434
// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
10435
// vector that will get spilled to the constant pool.
10436
if (V2.isUndef()) V2 = V1;
10437
10438
return LowerVPERM(Op, DAG, PermMask, VT, V1, V2);
10439
}
10440
10441
/// LowerVPERM - Emit a byte permute of V1 and V2 described by PermMask. The
/// node is a PPCISD::VPERM, or a PPCISD::XXPERM when VSX and P9 vector
/// support are present and at least one input has a single use. The element
/// mask is expanded into a v16i8 control vector, adjusted for little endian,
/// for an operand swap, and for inputs that sit behind an XXSWAPD. The result
/// is bitcast back to the original type of V1.
SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
                                      ArrayRef<int> PermMask, EVT VT,
                                      SDValue V1, SDValue V2) const {
  SDLoc dl(Op);
  const EVT ValType = V1.getValueType();
  const bool isLittleEndian = Subtarget.isLittleEndian();
  const bool isPPC64 = Subtarget.isPPC64();
  unsigned Opcode = PPCISD::VPERM;
  bool NeedSwap = false;

  if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
      (V1->hasOneUse() || V2->hasOneUse())) {
    LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
                         "XXPERM instead\n");
    Opcode = PPCISD::XXPERM;

    // XXPERM writes its result over its second input, so a second input with
    // several uses would need a copy. Arrange for the single-use operand to
    // end up in that position.
    const bool V1SingleUse = V1->hasOneUse();
    const bool V2SingleUse = V2->hasOneUse();
    const bool PreferSwap = isLittleEndian ? (!V1SingleUse && V2SingleUse)
                                           : (!V2SingleUse && V1SingleUse);
    if (PreferSwap) {
      std::swap(V1, V2);
      NeedSwap = true;
    }
  }

  // The SHUFFLE_VECTOR mask is nearly what vperm wants, but it counts input
  // elements rather than bytes, so it is converted below.
  //
  // On little endian the two inputs are passed in reverse order and every
  // mask byte is complemented with respect to 31, which yields the right
  // semantics from the big-endian-based vperm instruction.
  EVT EltVT = V1.getValueType().getVectorElementType();
  unsigned BytesPerElement = EltVT.getSizeInBits() / 8;

  bool V1HasXXSWAPD = V1->getOperand(0)->getOpcode() == PPCISD::XXSWAPD;
  bool V2HasXXSWAPD = V2->getOperand(0)->getOpcode() == PPCISD::XXSWAPD;

  // Index bookkeeping, with the inputs laid out as [ V1 | V2 ]:
  //
  //   XXSWAPD on V1:
  //     [  A  |  B  |  C   |   D   ] -> [  C  |  D  |  A   |   B   ]
  //       0-3   4-7   8-11   12-15        0-3   4-7   8-11   12-15
  //     so indices of A, B move up by 8 and indices of C, D move down by 8.
  //
  //   XXSWAPD on V2:
  //     [   E   |   F   |   G   |   H   ] -> [   G   |   H   |   E   |   F   ]
  //       16-19   20-23   24-27   28-31        16-19   20-23   24-27   28-31
  //     so indices of E, F move up by 8 and indices of G, H move down by 8.
  //
  //   Swapping V1 and V2:
  //     [ V1 | V2 ] -> [ V2 | V1 ]
  //     so V1 indices (0-15) move up by 16 and V2 indices (16-31) move down
  //     by 16.
  SmallVector<SDValue, 16> ResultMask;
  for (unsigned Idx = 0, NumElts = VT.getVectorNumElements(); Idx != NumElts;
       ++Idx) {
    const int RawElt = PermMask[Idx];
    // Undef mask entries are treated as selecting element 0.
    unsigned SrcElt = RawElt < 0 ? 0 : RawElt;

    if (V1HasXXSWAPD) {
      if (SrcElt < 8)
        SrcElt += 8;
      else if (SrcElt < 16)
        SrcElt -= 8;
    }
    if (V2HasXXSWAPD) {
      if (SrcElt > 23)
        SrcElt -= 8;
      else if (SrcElt > 15)
        SrcElt += 8;
    }
    if (NeedSwap)
      SrcElt = SrcElt < 16 ? SrcElt + 16 : SrcElt - 16;

    // Expand the element selector into one control byte per byte of the
    // element.
    for (unsigned ByteInElt = 0; ByteInElt != BytesPerElement; ++ByteInElt) {
      const unsigned SrcByte = SrcElt * BytesPerElement + ByteInElt;
      ResultMask.push_back(DAG.getConstant(
          isLittleEndian ? 31 - SrcByte : SrcByte, dl, MVT::i32));
    }
  }

  // The mask now accounts for the swaps, so permute the values that feed the
  // XXSWAPD nodes directly.
  if (V1HasXXSWAPD) {
    dl = SDLoc(V1->getOperand(0));
    V1 = V1->getOperand(0)->getOperand(1);
  }
  if (V2HasXXSWAPD) {
    dl = SDLoc(V2->getOperand(0));
    V2 = V2->getOperand(0)->getOperand(1);
  }

  if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
    if (ValType != MVT::v2f64)
      V1 = DAG.getBitcast(MVT::v2f64, V1);
    if (V2.getValueType() != MVT::v2f64)
      V2 = DAG.getBitcast(MVT::v2f64, V2);
  }

  ShufflesHandledWithVPERM++;
  SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
  LLVM_DEBUG({
    dbgs() << (Opcode == PPCISD::XXPERM
                   ? "Emitting a XXPERM for the following shuffle:\n"
                   : "Emitting a VPERM for the following shuffle:\n");
    cast<ShuffleVectorSDNode>(Op)->dump();
    dbgs() << "With the following permute control vector:\n";
    VPermMask.dump();
  });

  // XXPERM takes its control vector as v4i32.
  if (Opcode == PPCISD::XXPERM)
    VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask);

  // The mask itself is already correct; little endian only needs the two
  // inputs placed in reverse order.
  if (isLittleEndian)
    std::swap(V1, V2);

  SDValue Permute =
      DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask);
  return DAG.getBitcast(ValType, Permute);
}
10571
10572
/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
10573
/// vector comparison. If it is, return true and fill in Opc/isDot with
10574
/// information about the intrinsic.
10575
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
10576
bool &isDot, const PPCSubtarget &Subtarget) {
10577
unsigned IntrinsicID = Intrin.getConstantOperandVal(0);
10578
CompareOpc = -1;
10579
isDot = false;
10580
switch (IntrinsicID) {
10581
default:
10582
return false;
10583
// Comparison predicates.
10584
case Intrinsic::ppc_altivec_vcmpbfp_p:
10585
CompareOpc = 966;
10586
isDot = true;
10587
break;
10588
case Intrinsic::ppc_altivec_vcmpeqfp_p:
10589
CompareOpc = 198;
10590
isDot = true;
10591
break;
10592
case Intrinsic::ppc_altivec_vcmpequb_p:
10593
CompareOpc = 6;
10594
isDot = true;
10595
break;
10596
case Intrinsic::ppc_altivec_vcmpequh_p:
10597
CompareOpc = 70;
10598
isDot = true;
10599
break;
10600
case Intrinsic::ppc_altivec_vcmpequw_p:
10601
CompareOpc = 134;
10602
isDot = true;
10603
break;
10604
case Intrinsic::ppc_altivec_vcmpequd_p:
10605
if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10606
CompareOpc = 199;
10607
isDot = true;
10608
} else
10609
return false;
10610
break;
10611
case Intrinsic::ppc_altivec_vcmpneb_p:
10612
case Intrinsic::ppc_altivec_vcmpneh_p:
10613
case Intrinsic::ppc_altivec_vcmpnew_p:
10614
case Intrinsic::ppc_altivec_vcmpnezb_p:
10615
case Intrinsic::ppc_altivec_vcmpnezh_p:
10616
case Intrinsic::ppc_altivec_vcmpnezw_p:
10617
if (Subtarget.hasP9Altivec()) {
10618
switch (IntrinsicID) {
10619
default:
10620
llvm_unreachable("Unknown comparison intrinsic.");
10621
case Intrinsic::ppc_altivec_vcmpneb_p:
10622
CompareOpc = 7;
10623
break;
10624
case Intrinsic::ppc_altivec_vcmpneh_p:
10625
CompareOpc = 71;
10626
break;
10627
case Intrinsic::ppc_altivec_vcmpnew_p:
10628
CompareOpc = 135;
10629
break;
10630
case Intrinsic::ppc_altivec_vcmpnezb_p:
10631
CompareOpc = 263;
10632
break;
10633
case Intrinsic::ppc_altivec_vcmpnezh_p:
10634
CompareOpc = 327;
10635
break;
10636
case Intrinsic::ppc_altivec_vcmpnezw_p:
10637
CompareOpc = 391;
10638
break;
10639
}
10640
isDot = true;
10641
} else
10642
return false;
10643
break;
10644
case Intrinsic::ppc_altivec_vcmpgefp_p:
10645
CompareOpc = 454;
10646
isDot = true;
10647
break;
10648
case Intrinsic::ppc_altivec_vcmpgtfp_p:
10649
CompareOpc = 710;
10650
isDot = true;
10651
break;
10652
case Intrinsic::ppc_altivec_vcmpgtsb_p:
10653
CompareOpc = 774;
10654
isDot = true;
10655
break;
10656
case Intrinsic::ppc_altivec_vcmpgtsh_p:
10657
CompareOpc = 838;
10658
isDot = true;
10659
break;
10660
case Intrinsic::ppc_altivec_vcmpgtsw_p:
10661
CompareOpc = 902;
10662
isDot = true;
10663
break;
10664
case Intrinsic::ppc_altivec_vcmpgtsd_p:
10665
if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10666
CompareOpc = 967;
10667
isDot = true;
10668
} else
10669
return false;
10670
break;
10671
case Intrinsic::ppc_altivec_vcmpgtub_p:
10672
CompareOpc = 518;
10673
isDot = true;
10674
break;
10675
case Intrinsic::ppc_altivec_vcmpgtuh_p:
10676
CompareOpc = 582;
10677
isDot = true;
10678
break;
10679
case Intrinsic::ppc_altivec_vcmpgtuw_p:
10680
CompareOpc = 646;
10681
isDot = true;
10682
break;
10683
case Intrinsic::ppc_altivec_vcmpgtud_p:
10684
if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
10685
CompareOpc = 711;
10686
isDot = true;
10687
} else
10688
return false;
10689
break;
10690
10691
case Intrinsic::ppc_altivec_vcmpequq:
10692
case Intrinsic::ppc_altivec_vcmpgtsq:
10693
case Intrinsic::ppc_altivec_vcmpgtuq:
10694
if (!Subtarget.isISA3_1())
10695
return false;
10696
switch (IntrinsicID) {
10697
default:
10698
llvm_unreachable("Unknown comparison intrinsic.");
10699
case Intrinsic::ppc_altivec_vcmpequq:
10700
CompareOpc = 455;
10701
break;
10702
case Intrinsic::ppc_altivec_vcmpgtsq:
10703
CompareOpc = 903;
10704
break;
10705
case Intrinsic::ppc_altivec_vcmpgtuq:
10706
CompareOpc = 647;
10707
break;
10708
}
10709
break;
10710
10711
// VSX predicate comparisons use the same infrastructure
10712
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
10713
case Intrinsic::ppc_vsx_xvcmpgedp_p:
10714
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
10715
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
10716
case Intrinsic::ppc_vsx_xvcmpgesp_p:
10717
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
10718
if (Subtarget.hasVSX()) {
10719
switch (IntrinsicID) {
10720
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
10721
CompareOpc = 99;
10722
break;
10723
case Intrinsic::ppc_vsx_xvcmpgedp_p:
10724
CompareOpc = 115;
10725
break;
10726
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
10727
CompareOpc = 107;
10728
break;
10729
case Intrinsic::ppc_vsx_xvcmpeqsp_p:
10730
CompareOpc = 67;
10731
break;
10732
case Intrinsic::ppc_vsx_xvcmpgesp_p:
10733
CompareOpc = 83;
10734
break;
10735
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
10736
CompareOpc = 75;
10737
break;
10738
}
10739
isDot = true;
10740
} else
10741
return false;
10742
break;
10743
10744
// Normal Comparisons.
10745
case Intrinsic::ppc_altivec_vcmpbfp:
10746
CompareOpc = 966;
10747
break;
10748
case Intrinsic::ppc_altivec_vcmpeqfp:
10749
CompareOpc = 198;
10750
break;
10751
case Intrinsic::ppc_altivec_vcmpequb:
10752
CompareOpc = 6;
10753
break;
10754
case Intrinsic::ppc_altivec_vcmpequh:
10755
CompareOpc = 70;
10756
break;
10757
case Intrinsic::ppc_altivec_vcmpequw:
10758
CompareOpc = 134;
10759
break;
10760
case Intrinsic::ppc_altivec_vcmpequd:
10761
if (Subtarget.hasP8Altivec())
10762
CompareOpc = 199;
10763
else
10764
return false;
10765
break;
10766
case Intrinsic::ppc_altivec_vcmpneb:
10767
case Intrinsic::ppc_altivec_vcmpneh:
10768
case Intrinsic::ppc_altivec_vcmpnew:
10769
case Intrinsic::ppc_altivec_vcmpnezb:
10770
case Intrinsic::ppc_altivec_vcmpnezh:
10771
case Intrinsic::ppc_altivec_vcmpnezw:
10772
if (Subtarget.hasP9Altivec())
10773
switch (IntrinsicID) {
10774
default:
10775
llvm_unreachable("Unknown comparison intrinsic.");
10776
case Intrinsic::ppc_altivec_vcmpneb:
10777
CompareOpc = 7;
10778
break;
10779
case Intrinsic::ppc_altivec_vcmpneh:
10780
CompareOpc = 71;
10781
break;
10782
case Intrinsic::ppc_altivec_vcmpnew:
10783
CompareOpc = 135;
10784
break;
10785
case Intrinsic::ppc_altivec_vcmpnezb:
10786
CompareOpc = 263;
10787
break;
10788
case Intrinsic::ppc_altivec_vcmpnezh:
10789
CompareOpc = 327;
10790
break;
10791
case Intrinsic::ppc_altivec_vcmpnezw:
10792
CompareOpc = 391;
10793
break;
10794
}
10795
else
10796
return false;
10797
break;
10798
case Intrinsic::ppc_altivec_vcmpgefp:
10799
CompareOpc = 454;
10800
break;
10801
case Intrinsic::ppc_altivec_vcmpgtfp:
10802
CompareOpc = 710;
10803
break;
10804
case Intrinsic::ppc_altivec_vcmpgtsb:
10805
CompareOpc = 774;
10806
break;
10807
case Intrinsic::ppc_altivec_vcmpgtsh:
10808
CompareOpc = 838;
10809
break;
10810
case Intrinsic::ppc_altivec_vcmpgtsw:
10811
CompareOpc = 902;
10812
break;
10813
case Intrinsic::ppc_altivec_vcmpgtsd:
10814
if (Subtarget.hasP8Altivec())
10815
CompareOpc = 967;
10816
else
10817
return false;
10818
break;
10819
case Intrinsic::ppc_altivec_vcmpgtub:
10820
CompareOpc = 518;
10821
break;
10822
case Intrinsic::ppc_altivec_vcmpgtuh:
10823
CompareOpc = 582;
10824
break;
10825
case Intrinsic::ppc_altivec_vcmpgtuw:
10826
CompareOpc = 646;
10827
break;
10828
case Intrinsic::ppc_altivec_vcmpgtud:
10829
if (Subtarget.hasP8Altivec())
10830
CompareOpc = 711;
10831
else
10832
return false;
10833
break;
10834
case Intrinsic::ppc_altivec_vcmpequq_p:
10835
case Intrinsic::ppc_altivec_vcmpgtsq_p:
10836
case Intrinsic::ppc_altivec_vcmpgtuq_p:
10837
if (!Subtarget.isISA3_1())
10838
return false;
10839
switch (IntrinsicID) {
10840
default:
10841
llvm_unreachable("Unknown comparison intrinsic.");
10842
case Intrinsic::ppc_altivec_vcmpequq_p:
10843
CompareOpc = 455;
10844
break;
10845
case Intrinsic::ppc_altivec_vcmpgtsq_p:
10846
CompareOpc = 903;
10847
break;
10848
case Intrinsic::ppc_altivec_vcmpgtuq_p:
10849
CompareOpc = 647;
10850
break;
10851
}
10852
isDot = true;
10853
break;
10854
}
10855
return true;
10856
}
10857
10858
/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
10859
/// lower, do it, otherwise return null.
10860
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10861
SelectionDAG &DAG) const {
10862
unsigned IntrinsicID = Op.getConstantOperandVal(0);
10863
10864
SDLoc dl(Op);
10865
10866
switch (IntrinsicID) {
10867
case Intrinsic::thread_pointer:
10868
// Reads the thread pointer register, used for __builtin_thread_pointer.
10869
if (Subtarget.isPPC64())
10870
return DAG.getRegister(PPC::X13, MVT::i64);
10871
return DAG.getRegister(PPC::R2, MVT::i32);
10872
10873
case Intrinsic::ppc_rldimi: {
10874
assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");
10875
SDValue Src = Op.getOperand(1);
10876
APInt Mask = Op.getConstantOperandAPInt(4);
10877
if (Mask.isZero())
10878
return Op.getOperand(2);
10879
if (Mask.isAllOnes())
10880
return DAG.getNode(ISD::ROTL, dl, MVT::i64, Src, Op.getOperand(3));
10881
uint64_t SH = Op.getConstantOperandVal(3);
10882
unsigned MB = 0, ME = 0;
10883
if (!isRunOfOnes64(Mask.getZExtValue(), MB, ME))
10884
report_fatal_error("invalid rldimi mask!");
10885
// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
10886
if (ME < 63 - SH) {
10887
Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,
10888
DAG.getConstant(ME + SH + 1, dl, MVT::i32));
10889
} else if (ME > 63 - SH) {
10890
Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,
10891
DAG.getConstant(ME + SH - 63, dl, MVT::i32));
10892
}
10893
return SDValue(
10894
DAG.getMachineNode(PPC::RLDIMI, dl, MVT::i64,
10895
{Op.getOperand(2), Src,
10896
DAG.getTargetConstant(63 - ME, dl, MVT::i32),
10897
DAG.getTargetConstant(MB, dl, MVT::i32)}),
10898
0);
10899
}
10900
10901
case Intrinsic::ppc_rlwimi: {
10902
APInt Mask = Op.getConstantOperandAPInt(4);
10903
if (Mask.isZero())
10904
return Op.getOperand(2);
10905
if (Mask.isAllOnes())
10906
return DAG.getNode(ISD::ROTL, dl, MVT::i32, Op.getOperand(1),
10907
Op.getOperand(3));
10908
unsigned MB = 0, ME = 0;
10909
if (!isRunOfOnes(Mask.getZExtValue(), MB, ME))
10910
report_fatal_error("invalid rlwimi mask!");
10911
return SDValue(DAG.getMachineNode(
10912
PPC::RLWIMI, dl, MVT::i32,
10913
{Op.getOperand(2), Op.getOperand(1), Op.getOperand(3),
10914
DAG.getTargetConstant(MB, dl, MVT::i32),
10915
DAG.getTargetConstant(ME, dl, MVT::i32)}),
10916
0);
10917
}
10918
10919
case Intrinsic::ppc_rlwnm: {
10920
if (Op.getConstantOperandVal(3) == 0)
10921
return DAG.getConstant(0, dl, MVT::i32);
10922
unsigned MB = 0, ME = 0;
10923
if (!isRunOfOnes(Op.getConstantOperandVal(3), MB, ME))
10924
report_fatal_error("invalid rlwnm mask!");
10925
return SDValue(
10926
DAG.getMachineNode(PPC::RLWNM, dl, MVT::i32,
10927
{Op.getOperand(1), Op.getOperand(2),
10928
DAG.getTargetConstant(MB, dl, MVT::i32),
10929
DAG.getTargetConstant(ME, dl, MVT::i32)}),
10930
0);
10931
}
10932
10933
case Intrinsic::ppc_mma_disassemble_acc: {
10934
if (Subtarget.isISAFuture()) {
10935
EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
10936
SDValue WideVec =
10937
SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes,
10938
Op.getOperand(1)),
10939
0);
10940
SmallVector<SDValue, 4> RetOps;
10941
SDValue Value = SDValue(WideVec.getNode(), 0);
10942
SDValue Value2 = SDValue(WideVec.getNode(), 1);
10943
10944
SDValue Extract;
10945
Extract = DAG.getNode(
10946
PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
10947
Subtarget.isLittleEndian() ? Value2 : Value,
10948
DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0,
10949
dl, getPointerTy(DAG.getDataLayout())));
10950
RetOps.push_back(Extract);
10951
Extract = DAG.getNode(
10952
PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
10953
Subtarget.isLittleEndian() ? Value2 : Value,
10954
DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1,
10955
dl, getPointerTy(DAG.getDataLayout())));
10956
RetOps.push_back(Extract);
10957
Extract = DAG.getNode(
10958
PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
10959
Subtarget.isLittleEndian() ? Value : Value2,
10960
DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0,
10961
dl, getPointerTy(DAG.getDataLayout())));
10962
RetOps.push_back(Extract);
10963
Extract = DAG.getNode(
10964
PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
10965
Subtarget.isLittleEndian() ? Value : Value2,
10966
DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1,
10967
dl, getPointerTy(DAG.getDataLayout())));
10968
RetOps.push_back(Extract);
10969
return DAG.getMergeValues(RetOps, dl);
10970
}
10971
[[fallthrough]];
10972
}
10973
case Intrinsic::ppc_vsx_disassemble_pair: {
10974
int NumVecs = 2;
10975
SDValue WideVec = Op.getOperand(1);
10976
if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {
10977
NumVecs = 4;
10978
WideVec = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, WideVec);
10979
}
10980
SmallVector<SDValue, 4> RetOps;
10981
for (int VecNo = 0; VecNo < NumVecs; VecNo++) {
10982
SDValue Extract = DAG.getNode(
10983
PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec,
10984
DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
10985
: VecNo,
10986
dl, getPointerTy(DAG.getDataLayout())));
10987
RetOps.push_back(Extract);
10988
}
10989
return DAG.getMergeValues(RetOps, dl);
10990
}
10991
10992
case Intrinsic::ppc_mma_xxmfacc:
10993
case Intrinsic::ppc_mma_xxmtacc: {
10994
// Allow pre-isa-future subtargets to lower as normal.
10995
if (!Subtarget.isISAFuture())
10996
return SDValue();
10997
// The intrinsics for xxmtacc and xxmfacc take one argument of
10998
// type v512i1, for future cpu the corresponding wacc instruction
10999
// dmxx[inst|extf]dmr512 is always generated for type v512i1, negating
11000
// the need to produce the xxm[t|f]acc.
11001
SDValue WideVec = Op.getOperand(1);
11002
DAG.ReplaceAllUsesWith(Op, WideVec);
11003
return SDValue();
11004
}
11005
11006
case Intrinsic::ppc_unpack_longdouble: {
11007
auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
11008
assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
11009
"Argument of long double unpack must be 0 or 1!");
11010
return DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(1),
11011
DAG.getConstant(!!(Idx->getSExtValue()), dl,
11012
Idx->getValueType(0)));
11013
}
11014
11015
case Intrinsic::ppc_compare_exp_lt:
11016
case Intrinsic::ppc_compare_exp_gt:
11017
case Intrinsic::ppc_compare_exp_eq:
11018
case Intrinsic::ppc_compare_exp_uo: {
11019
unsigned Pred;
11020
switch (IntrinsicID) {
11021
case Intrinsic::ppc_compare_exp_lt:
11022
Pred = PPC::PRED_LT;
11023
break;
11024
case Intrinsic::ppc_compare_exp_gt:
11025
Pred = PPC::PRED_GT;
11026
break;
11027
case Intrinsic::ppc_compare_exp_eq:
11028
Pred = PPC::PRED_EQ;
11029
break;
11030
case Intrinsic::ppc_compare_exp_uo:
11031
Pred = PPC::PRED_UN;
11032
break;
11033
}
11034
return SDValue(
11035
DAG.getMachineNode(
11036
PPC::SELECT_CC_I4, dl, MVT::i32,
11037
{SDValue(DAG.getMachineNode(PPC::XSCMPEXPDP, dl, MVT::i32,
11038
Op.getOperand(1), Op.getOperand(2)),
11039
0),
11040
DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),
11041
DAG.getTargetConstant(Pred, dl, MVT::i32)}),
11042
0);
11043
}
11044
case Intrinsic::ppc_test_data_class: {
11045
EVT OpVT = Op.getOperand(1).getValueType();
11046
unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP
11047
: (OpVT == MVT::f64 ? PPC::XSTSTDCDP
11048
: PPC::XSTSTDCSP);
11049
return SDValue(
11050
DAG.getMachineNode(
11051
PPC::SELECT_CC_I4, dl, MVT::i32,
11052
{SDValue(DAG.getMachineNode(CmprOpc, dl, MVT::i32, Op.getOperand(2),
11053
Op.getOperand(1)),
11054
0),
11055
DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),
11056
DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}),
11057
0);
11058
}
11059
case Intrinsic::ppc_fnmsub: {
11060
EVT VT = Op.getOperand(1).getValueType();
11061
if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
11062
return DAG.getNode(
11063
ISD::FNEG, dl, VT,
11064
DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2),
11065
DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3))));
11066
return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1),
11067
Op.getOperand(2), Op.getOperand(3));
11068
}
11069
case Intrinsic::ppc_convert_f128_to_ppcf128:
11070
case Intrinsic::ppc_convert_ppcf128_to_f128: {
11071
RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
11072
? RTLIB::CONVERT_PPCF128_F128
11073
: RTLIB::CONVERT_F128_PPCF128;
11074
MakeLibCallOptions CallOptions;
11075
std::pair<SDValue, SDValue> Result =
11076
makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(1), CallOptions,
11077
dl, SDValue());
11078
return Result.first;
11079
}
11080
case Intrinsic::ppc_maxfe:
11081
case Intrinsic::ppc_maxfl:
11082
case Intrinsic::ppc_maxfs:
11083
case Intrinsic::ppc_minfe:
11084
case Intrinsic::ppc_minfl:
11085
case Intrinsic::ppc_minfs: {
11086
EVT VT = Op.getValueType();
11087
assert(
11088
all_of(Op->ops().drop_front(4),
11089
[VT](const SDUse &Use) { return Use.getValueType() == VT; }) &&
11090
"ppc_[max|min]f[e|l|s] must have uniform type arguments");
11091
(void)VT;
11092
ISD::CondCode CC = ISD::SETGT;
11093
if (IntrinsicID == Intrinsic::ppc_minfe ||
11094
IntrinsicID == Intrinsic::ppc_minfl ||
11095
IntrinsicID == Intrinsic::ppc_minfs)
11096
CC = ISD::SETLT;
11097
unsigned I = Op.getNumOperands() - 2, Cnt = I;
11098
SDValue Res = Op.getOperand(I);
11099
for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) {
11100
Res =
11101
DAG.getSelectCC(dl, Res, Op.getOperand(I), Res, Op.getOperand(I), CC);
11102
}
11103
return Res;
11104
}
11105
}
11106
11107
// If this is a lowered altivec predicate compare, CompareOpc is set to the
11108
// opcode number of the comparison.
11109
int CompareOpc;
11110
bool isDot;
11111
if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
11112
return SDValue(); // Don't custom lower most intrinsics.
11113
11114
// If this is a non-dot comparison, make the VCMP node and we are done.
11115
if (!isDot) {
11116
SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
11117
Op.getOperand(1), Op.getOperand(2),
11118
DAG.getConstant(CompareOpc, dl, MVT::i32));
11119
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
11120
}
11121
11122
// Create the PPCISD altivec 'dot' comparison node.
11123
SDValue Ops[] = {
11124
Op.getOperand(2), // LHS
11125
Op.getOperand(3), // RHS
11126
DAG.getConstant(CompareOpc, dl, MVT::i32)
11127
};
11128
EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
11129
SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);
11130
11131
// Now that we have the comparison, emit a copy from the CR to a GPR.
11132
// This is flagged to the above dot comparison.
11133
SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
11134
DAG.getRegister(PPC::CR6, MVT::i32),
11135
CompNode.getValue(1));
11136
11137
// Unpack the result based on how the target uses it.
11138
unsigned BitNo; // Bit # of CR6.
11139
bool InvertBit; // Invert result?
11140
switch (Op.getConstantOperandVal(1)) {
11141
default: // Can't happen, don't crash on invalid number though.
11142
case 0: // Return the value of the EQ bit of CR6.
11143
BitNo = 0; InvertBit = false;
11144
break;
11145
case 1: // Return the inverted value of the EQ bit of CR6.
11146
BitNo = 0; InvertBit = true;
11147
break;
11148
case 2: // Return the value of the LT bit of CR6.
11149
BitNo = 2; InvertBit = false;
11150
break;
11151
case 3: // Return the inverted value of the LT bit of CR6.
11152
BitNo = 2; InvertBit = true;
11153
break;
11154
}
11155
11156
// Shift the bit into the low position.
11157
Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
11158
DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
11159
// Isolate the bit.
11160
Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
11161
DAG.getConstant(1, dl, MVT::i32));
11162
11163
// If we are supposed to, toggle the bit.
11164
if (InvertBit)
11165
Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
11166
DAG.getConstant(1, dl, MVT::i32));
11167
return Flags;
11168
}
11169
11170
/// Custom lower void intrinsics. Only llvm.ppc.cfence is handled here; every
/// other intrinsic yields a null SDValue.
SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                               SelectionDAG &DAG) const {
  // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
  // the beginning of the argument list.
  const int FirstArg = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
  SDLoc DL(Op);

  if (Op.getConstantOperandVal(FirstArg) != Intrinsic::ppc_cfence)
    return SDValue();

  assert(FirstArg == 1 && "llvm.ppc.cfence must carry a chain argument.");
  SDValue Fenced = Op.getOperand(FirstArg + 1);
  if (Fenced.getValueType() == MVT::i128) {
    // FIXME: Testing one of two paired registers is sufficient to guarantee
    // ordering?
    Fenced = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Fenced);
  }

  // The fence opcode and the width the value is extended to both follow the
  // pointer size of the subtarget.
  const bool Is64 = Subtarget.isPPC64();
  const unsigned FenceOpc = Is64 ? PPC::CFENCE8 : PPC::CFENCE;
  const EVT FenceVT = Is64 ? MVT::i64 : MVT::i32;
  SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, FenceVT, Fenced);
  return SDValue(
      DAG.getMachineNode(FenceOpc, DL, MVT::Other, Widened, Op.getOperand(0)),
      0);
}
11199
11200
// Lower scalar BSWAP64 to xxbrd.
11201
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
11202
SDLoc dl(Op);
11203
if (!Subtarget.isPPC64())
11204
return Op;
11205
// MTVSRDD
11206
Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
11207
Op.getOperand(0));
11208
// XXBRD
11209
Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
11210
// MFVSRD
11211
int VectorIndex = 0;
11212
if (Subtarget.isLittleEndian())
11213
VectorIndex = 1;
11214
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
11215
DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
11216
return Op;
11217
}
11218
11219
// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
11220
// compared to a value that is atomically loaded (atomic loads zero-extend).
11221
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11222
SelectionDAG &DAG) const {
11223
assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
11224
"Expecting an atomic compare-and-swap here.");
11225
SDLoc dl(Op);
11226
auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
11227
EVT MemVT = AtomicNode->getMemoryVT();
11228
if (MemVT.getSizeInBits() >= 32)
11229
return Op;
11230
11231
SDValue CmpOp = Op.getOperand(2);
11232
// If this is already correctly zero-extended, leave it alone.
11233
auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
11234
if (DAG.MaskedValueIsZero(CmpOp, HighBits))
11235
return Op;
11236
11237
// Clear the high bits of the compare operand.
11238
unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
11239
SDValue NewCmpOp =
11240
DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
11241
DAG.getConstant(MaskVal, dl, MVT::i32));
11242
11243
// Replace the existing compare operand with the properly zero-extended one.
11244
SmallVector<SDValue, 4> Ops;
11245
for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
11246
Ops.push_back(AtomicNode->getOperand(i));
11247
Ops[2] = NewCmpOp;
11248
MachineMemOperand *MMO = AtomicNode->getMemOperand();
11249
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
11250
auto NodeTy =
11251
(MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
11252
return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
11253
}
11254
11255
/// Lower quadword (i128) atomic loads and stores to the
/// ppc_atomic_load_i128 / ppc_atomic_store_i128 intrinsics, which operate on
/// two i64 halves.
SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());
  const EVT MemVT = Atomic->getMemoryVT();
  assert(MemVT.getSimpleVT() == MVT::i128 &&
         "Expect quadword atomic operations");
  SDLoc dl(Atomic);
  const unsigned Opc = Atomic->getOpcode();

  if (Opc == ISD::ATOMIC_LOAD) {
    // Lower quadword atomic load to int_ppc_atomic_load_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList ResultTys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
    SmallVector<SDValue, 4> IntrinOps;
    IntrinOps.push_back(Atomic->getOperand(0));
    IntrinOps.push_back(
        DAG.getConstant(Intrinsic::ppc_atomic_load_i128, dl, MVT::i32));
    for (unsigned I = 1, E = Atomic->getNumOperands(); I < E; ++I)
      IntrinOps.push_back(Atomic->getOperand(I));
    SDValue Halves =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, ResultTys,
                                IntrinOps, MemVT, Atomic->getMemOperand());

    // Recombine the halves: result 0 is the low 64 bits, result 1 is shifted
    // up by 64 to form the high bits.
    SDValue Low = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, Halves);
    SDValue High =
        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, Halves.getValue(1));
    High = DAG.getNode(ISD::SHL, dl, MVT::i128, High,
                       DAG.getConstant(64, dl, MVT::i32));
    SDValue Combined =
        DAG.getNode(ISD::OR, dl, {MVT::i128, MVT::Other}, {Low, High});
    // Result 2 of the intrinsic node is the output chain.
    return DAG.getNode(ISD::MERGE_VALUES, dl, {MVT::i128, MVT::Other},
                       {Combined, Halves.getValue(2)});
  }

  if (Opc == ISD::ATOMIC_STORE) {
    // Lower quadword atomic store to int_ppc_atomic_store_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList ResultTys = DAG.getVTList(MVT::Other);
    SmallVector<SDValue, 4> IntrinOps;
    IntrinOps.push_back(Atomic->getOperand(0));
    IntrinOps.push_back(
        DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32));

    // Split the i128 value into its low and high 64-bit halves.
    SDValue Stored = Atomic->getOperand(1);
    SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Stored);
    SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i128, Stored,
                               DAG.getConstant(64, dl, MVT::i32));
    High = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, High);

    IntrinOps.push_back(Low);
    IntrinOps.push_back(High);
    IntrinOps.push_back(Atomic->getOperand(2));
    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, ResultTys,
                                   IntrinOps, MemVT, Atomic->getMemOperand());
  }

  llvm_unreachable("Unexpected atomic opcode");
}
11307
11308
/// Build an i1 value that is true when Op belongs to one of the
/// floating-point classes in Mask, using the XSTSTDC[SP|DP|QP] machine
/// instructions. Classes those instructions cannot express directly (normal
/// numbers, quiet vs. signaling NaN) are handled by recursing on sub-masks
/// and combining the partial results.
static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl,
                                SelectionDAG &DAG,
                                const PPCSubtarget &Subtarget) {
  assert(Mask <= fcAllFlags && "Invalid fp_class flags!");

  // Bits of the immediate operand passed to the test instruction.
  enum DataClassMask {
    DC_NAN = 1 << 6,
    DC_NEG_INF = 1 << 4,
    DC_POS_INF = 1 << 5,
    DC_NEG_ZERO = 1 << 2,
    DC_POS_ZERO = 1 << 3,
    DC_NEG_SUBNORM = 1,
    DC_POS_SUBNORM = 1 << 1,
  };

  const EVT VT = Op.getValueType();

  // Pick the test instruction from the operand type.
  unsigned TestOp = PPC::XSTSTDCSP;
  if (VT == MVT::f128)
    TestOp = PPC::XSTSTDCQP;
  else if (VT == MVT::f64)
    TestOp = PPC::XSTSTDCDP;

  // Trivial masks fold to constants.
  if (Mask == fcAllFlags)
    return DAG.getBoolConstant(true, Dl, MVT::i1, VT);
  if (Mask == 0)
    return DAG.getBoolConstant(false, Dl, MVT::i1, VT);

  // Extracts one CR bit (selected by sub-register index SubIdx) of CR as i1.
  auto ExtractCRBit = [&](SDValue CR, unsigned SubIdx) {
    return SDValue(
        DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, CR,
                           DAG.getTargetConstant(SubIdx, Dl, MVT::i32)),
        0);
  };

  // When it's cheaper or necessary to test reverse flags.
  const bool TestInverse =
      (Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan;
  if (TestInverse) {
    SDValue Inverse = getDataClassTest(Op, ~Mask, Dl, DAG, Subtarget);
    return DAG.getNOT(Dl, Inverse, MVT::i1);
  }

  // Power doesn't support testing whether a value is 'normal'. Test the rest
  // first, and test if it's 'not not-normal' with expected sign.
  if (Mask & fcNormal) {
    SDValue AllButNormal(
        DAG.getMachineNode(
            TestOp, Dl, MVT::i32,
            DAG.getTargetConstant(DC_NAN | DC_NEG_INF | DC_POS_INF |
                                      DC_NEG_ZERO | DC_POS_ZERO |
                                      DC_NEG_SUBNORM | DC_POS_SUBNORM,
                                  Dl, MVT::i32),
            Op),
        0);
    // Sign are stored in CR bit 0, result are in CR bit 2.
    SDValue Sign = ExtractCRBit(AllButNormal, PPC::sub_lt);
    SDValue IsNormal =
        DAG.getNOT(Dl, ExtractCRBit(AllButNormal, PPC::sub_eq), MVT::i1);
    if (Mask & fcPosNormal)
      Sign = DAG.getNOT(Dl, Sign, MVT::i1);
    SDValue SignedNormal = DAG.getNode(ISD::AND, Dl, MVT::i1, Sign, IsNormal);
    if (Mask == fcPosNormal || Mask == fcNegNormal)
      return SignedNormal;

    // OR in the test for whatever classes remain once 'normal' is removed.
    SDValue Others =
        getDataClassTest(Op, Mask & ~fcNormal, Dl, DAG, Subtarget);
    return DAG.getNode(ISD::OR, Dl, MVT::i1, Others, SignedNormal);
  }

  // The instruction doesn't differentiate between signaling or quiet NaN. Test
  // the rest first, and test if it 'is NaN and is signaling/quiet'.
  if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) {
    bool WantQuiet = Mask & fcQNan;
    SDValue IsNan = getDataClassTest(Op, fcNan, Dl, DAG, Subtarget);

    // Quietness is determined by the first bit in fraction field. Fetch the
    // 32-bit word holding that bit and the mask selecting it.
    uint64_t QuietBit = 0;
    SDValue TopWord;
    if (VT == MVT::f128) {
      TopWord = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op),
          DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 3 : 0, Dl));
      QuietBit = 0x8000;
    } else if (VT == MVT::f64) {
      if (Subtarget.isPPC64()) {
        TopWord = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32,
                              DAG.getBitcast(MVT::i64, Op),
                              DAG.getConstant(1, Dl, MVT::i32));
      } else {
        // Without 64-bit support, go through a vector and pick the word by
        // endianness.
        SDValue AsWords = DAG.getBitcast(
            MVT::v4i32, DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v2f64, Op));
        TopWord = DAG.getNode(
            ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, AsWords,
            DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 1 : 0, Dl));
      }
      QuietBit = 0x80000;
    } else if (VT == MVT::f32) {
      TopWord = DAG.getBitcast(MVT::i32, Op);
      QuietBit = 0x400000;
    }

    // Quiet NaN: the bit is set; signaling NaN: the bit is clear.
    SDValue QuietnessMatches = DAG.getSetCC(
        Dl, MVT::i1,
        DAG.getNode(ISD::AND, Dl, MVT::i32, TopWord,
                    DAG.getConstant(QuietBit, Dl, MVT::i32)),
        DAG.getConstant(0, Dl, MVT::i32), WantQuiet ? ISD::SETNE : ISD::SETEQ);
    SDValue NanOfKind =
        DAG.getNode(ISD::AND, Dl, MVT::i1, IsNan, QuietnessMatches);
    if (Mask == fcQNan || Mask == fcSNan)
      return NanOfKind;

    // OR in the test for whatever classes remain once NaN is removed.
    SDValue Others = getDataClassTest(Op, Mask & ~fcNan, Dl, DAG, Subtarget);
    return DAG.getNode(ISD::OR, Dl, MVT::i1, Others, NanOfKind);
  }

  // Everything left maps directly onto bits of the instruction's immediate.
  unsigned NativeMask = 0;
  if ((Mask & fcNan) == fcNan)
    NativeMask |= DC_NAN;
  const std::pair<FPClassTest, unsigned> DirectClasses[] = {
      {fcNegInf, DC_NEG_INF},         {fcPosInf, DC_POS_INF},
      {fcNegZero, DC_NEG_ZERO},       {fcPosZero, DC_POS_ZERO},
      {fcNegSubnormal, DC_NEG_SUBNORM}, {fcPosSubnormal, DC_POS_SUBNORM},
  };
  for (const auto &Entry : DirectClasses)
    if (Mask & Entry.first)
      NativeMask |= Entry.second;

  SDValue TestResult(
      DAG.getMachineNode(TestOp, Dl, MVT::i32,
                         DAG.getTargetConstant(NativeMask, Dl, MVT::i32), Op),
      0);
  return ExtractCRBit(TestResult, PPC::sub_eq);
}
11444
11445
/// Lower IS_FPCLASS by forwarding operand 0 and the class mask in operand 1
/// to getDataClassTest.
SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert(Subtarget.hasP9Vector() && "Test data class requires Power9");
  SDLoc Dl(Op);
  const auto Classes = static_cast<FPClassTest>(Op.getConstantOperandVal(1));
  return getDataClassTest(Op.getOperand(0), Classes, Dl, DAG, Subtarget);
}
11454
11455
/// Lower SCALAR_TO_VECTOR by storing the scalar to a fresh stack slot and
/// loading the slot back as the vector type.
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
  const int Slot = FrameInfo.CreateStackObject(16, Align(16), false);
  SDValue SlotAddr =
      DAG.getFrameIndex(Slot, getPointerTy(DAG.getDataLayout()));

  // Store the input value into Value#0 of the stack slot.
  SDValue Spill = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               SlotAddr, MachinePointerInfo());
  // Load it out, chained on the store.
  return DAG.getLoad(Op.getValueType(), dl, Spill, SlotAddr,
                     MachinePointerInfo());
}
11470
11471
/// Custom lowering for INSERT_VECTOR_ELT. Returns Op when the node can be
/// kept as is, a null SDValue when this hook declines to handle it, or a
/// replacement node.
SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Should only be called for ISD::INSERT_VECTOR_ELT");

  SDLoc dl(Op);
  const EVT VT = Op.getValueType();
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  // Null when the insertion index is not a compile-time constant.
  ConstantSDNode *ConstIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));

  if (ConstIdx && VT == MVT::v2f64)
    return Op;

  // A f32 load feeding into a v4f32 insert_vector_elt is handled in this way
  // because on P10, it allows this specific insert_vector_elt load pattern to
  // utilize the refactored load and store infrastructure in order to exploit
  // prefixed loads.
  // On targets with inexpensive direct moves (Power9 and up), a
  // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer
  // load since a single precision load will involve conversion to double
  // precision on the load followed by another conversion to single precision.
  if (Subtarget.hasP9Vector() && VT == MVT::v4f32 &&
      Elt.getValueType() == MVT::f32 && isa<LoadSDNode>(Elt)) {
    SDValue IntVec = DAG.getBitcast(MVT::v4i32, Vec);
    SDValue IntElt = DAG.getBitcast(MVT::i32, Elt);
    SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i32,
                                   IntVec, IntElt, Op.getOperand(2));
    return DAG.getBitcast(MVT::v4f32, Inserted);
  }

  if (Subtarget.isISA3_1()) {
    const bool Has64BitElts = VT == MVT::v2i64 || VT == MVT::v2f64;
    if (Has64BitElts && !Subtarget.isPPC64())
      return SDValue();
    // On P10, we have legal lowering for constant and variable indices for
    // all vectors.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
        VT == MVT::v4f32 || Has64BitElts)
      return Op;
  }

  // Before P10, we have legal lowering for constant indices but not for
  // variable ones.
  if (!ConstIdx)
    return SDValue();

  // Anything other than v8i16 / v16i8 is left as is.
  if (VT != MVT::v8i16 && VT != MVT::v16i8)
    return Op;

  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
  SDValue Moved = DAG.getNode(PPCISD::MTVSRZ, dl, VT, Elt);
  const unsigned EltBytes = VT.getVectorElementType().getSizeInBits() / 8;
  const unsigned EltIdx = ConstIdx->getZExtValue();
  unsigned ByteOffset = EltIdx * EltBytes;
  // On little endian the byte offset is counted from the other end of the
  // 16-byte register.
  if (Subtarget.isLittleEndian())
    ByteOffset = (16 - EltBytes) - ByteOffset;
  return DAG.getNode(PPCISD::VECINSERT, dl, VT, Vec, Moved,
                     DAG.getConstant(ByteOffset, dl, MVT::i32));
}
11535
11536
/// Custom lowering for loads of v256i1 and v512i1. Loads of any other type
/// are returned unchanged.
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op.getNode());
  SDValue Chain = Load->getChain();
  SDValue Addr = Load->getBasePtr();
  const EVT VT = Op.getValueType();

  const bool IsWideType = VT == MVT::v256i1 || VT == MVT::v512i1;
  if (!IsWideType)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
  // 2 or 4 vsx registers.
  assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");

  const Align BaseAlign = Load->getAlign();
  const unsigned NumParts = VT.getSizeInBits() / 128;
  SmallVector<SDValue, 4> Parts;
  SmallVector<SDValue, 4> PartChains;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    const unsigned Offset = Part * 16;
    SDValue PartLoad =
        DAG.getLoad(MVT::v16i8, dl, Chain, Addr,
                    Load->getPointerInfo().getWithOffset(Offset),
                    commonAlignment(BaseAlign, Offset),
                    Load->getMemOperand()->getFlags(), Load->getAAInfo());
    // Step the address to the next 16-byte piece.
    Addr = DAG.getNode(ISD::ADD, dl, Addr.getValueType(), Addr,
                       DAG.getConstant(16, dl, Addr.getValueType()));
    Parts.push_back(PartLoad);
    PartChains.push_back(PartLoad.getValue(1));
  }

  // Little endian assembles the pieces in the opposite order.
  if (Subtarget.isLittleEndian()) {
    std::reverse(Parts.begin(), Parts.end());
    std::reverse(PartChains.begin(), PartChains.end());
  }

  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, PartChains);
  const unsigned BuildOpc =
      VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD;
  SDValue Wide = DAG.getNode(BuildOpc, dl, VT, Parts);
  SDValue Results[] = {Wide, OutChain};
  return DAG.getMergeValues(Results, dl);
}
11580
11581
/// Custom lowering for stores of v256i1 and v512i1. Stores of any other type
/// are returned unchanged.
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op.getNode());
  SDValue Chain = Store->getChain();
  SDValue Addr = Store->getBasePtr();
  // Both sources start out as the stored value; the second one is only
  // replaced (and only read) on the ISA-future accumulator path.
  SDValue Src0 = Store->getValue();
  SDValue Src1 = Store->getValue();
  const EVT StoreVT = Src0.getValueType();

  const bool IsWideType = StoreVT == MVT::v256i1 || StoreVT == MVT::v512i1;
  if (!IsWideType)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
  // underlying registers individually.
  assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");

  const Align BaseAlign = Store->getAlign();
  const bool IsLE = Subtarget.isLittleEndian();
  const bool IsFuture = Subtarget.isISAFuture();

  unsigned NumParts = 2;
  if (StoreVT == MVT::v512i1) {
    NumParts = 4;
    if (IsFuture) {
      // Split the accumulator into two v256i1 results.
      EVT HalfTys[] = {MVT::v256i1, MVT::v256i1};
      MachineSDNode *Split = DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl,
                                                HalfTys, Op.getOperand(1));
      Src0 = SDValue(Split, 0);
      Src1 = SDValue(Split, 1);
    } else {
      Src0 = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Src0);
    }
  }

  SmallVector<SDValue, 4> PartStores;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    // Choose the wide value and the register within it for this piece.
    SDValue Wide = Src0;
    unsigned RegNo;
    if (IsFuture) {
      RegNo = IsLE ? 1 - (Part % 2) : (Part % 2);
      if (Part > 1)
        Wide = Src1;
    } else {
      RegNo = IsLE ? NumParts - 1 - Part : Part;
    }
    SDValue Piece = DAG.getNode(
        PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Wide,
        DAG.getConstant(RegNo, dl, getPointerTy(DAG.getDataLayout())));

    const unsigned Offset = Part * 16;
    SDValue PartStore =
        DAG.getStore(Chain, dl, Piece, Addr,
                     Store->getPointerInfo().getWithOffset(Offset),
                     commonAlignment(BaseAlign, Offset),
                     Store->getMemOperand()->getFlags(), Store->getAAInfo());
    // Step the address to the next 16-byte piece.
    Addr = DAG.getNode(ISD::ADD, dl, Addr.getValueType(), Addr,
                       DAG.getConstant(16, dl, Addr.getValueType()));
    PartStores.push_back(PartStore);
  }
  return DAG.getTokenFactor(dl, PartStores);
}
11640
11641
/// Custom lowering for vector multiplies of type v4i32 and v16i8, built from
/// Altivec intrinsics. Any other type is unreachable here.
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const EVT VT = Op.getValueType();

  if (VT == MVT::v4i32) {
    SDValue A = Op.getOperand(0);
    SDValue B = Op.getOperand(1);

    SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
    // +16 as shift amt.
    SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
    // = vrlw RHS, 16
    SDValue BSwapped =
        BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, B, Neg16, DAG, dl);

    // Shrinkify inputs to v8i16.
    A = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, A);
    B = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, B);
    BSwapped = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, BSwapped);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDValue LowProduct = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh, A, B,
                                          DAG, dl, MVT::v4i32);

    SDValue CrossProduct = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm, A,
                                            BSwapped, Zero, DAG, dl,
                                            MVT::v4i32);
    // Shift the high parts up 16 bits.
    CrossProduct = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, CrossProduct,
                                    Neg16, DAG, dl);
    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LowProduct, CrossProduct);
  }

  if (VT == MVT::v16i8) {
    SDValue A = Op.getOperand(0);
    SDValue B = Op.getOperand(1);
    const bool IsLE = Subtarget.isLittleEndian();

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDValue Even = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub, A, B, DAG,
                                    dl, MVT::v8i16);
    Even = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Even);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDValue Odd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, A, B, DAG,
                                   dl, MVT::v8i16);
    Odd = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Odd);

    // Merge the results together. Because vmuleub and vmuloub are
    // instructions with a big-endian bias, we must reverse the
    // element numbering and reverse the meaning of "odd" and "even"
    // when generating little endian code.
    int ShuffleMask[16];
    for (unsigned Pair = 0; Pair != 8; ++Pair) {
      // Byte taken from the first shuffle input; the matching byte of the
      // second input sits 16 entries further on.
      const int FromFirst = IsLE ? 2 * Pair : 2 * Pair + 1;
      ShuffleMask[Pair * 2] = FromFirst;
      ShuffleMask[Pair * 2 + 1] = FromFirst + 16;
    }
    SDValue First = IsLE ? Odd : Even;
    SDValue Second = IsLE ? Even : Odd;
    return DAG.getVectorShuffle(MVT::v16i8, dl, First, Second, ShuffleMask);
  }

  llvm_unreachable("Unknown mul to lower!");
}
11704
11705
/// Custom lowering for (STRICT_)FP_ROUND. Returns an empty SDValue when the
/// source is f128 and the subtarget lacks P9 vector support; otherwise the
/// node is returned unchanged.
SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  // The value being rounded is operand 1 for strict FP opcodes, operand 0
  // otherwise.
  const unsigned SrcIdx = Op->isStrictFPOpcode() ? 1 : 0;
  const bool SrcIsF128 = Op.getOperand(SrcIdx).getValueType() == MVT::f128;

  if (SrcIsF128 && !Subtarget.hasP9Vector())
    return SDValue();
  return Op;
}
11713
11714
// Custom lowering for fpext vf32 to v2f64
11715
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
11716
11717
assert(Op.getOpcode() == ISD::FP_EXTEND &&
11718
"Should only be called for ISD::FP_EXTEND");
11719
11720
// FIXME: handle extends from half precision float vectors on P9.
11721
// We only want to custom lower an extend from v2f32 to v2f64.
11722
if (Op.getValueType() != MVT::v2f64 ||
11723
Op.getOperand(0).getValueType() != MVT::v2f32)
11724
return SDValue();
11725
11726
SDLoc dl(Op);
11727
SDValue Op0 = Op.getOperand(0);
11728
11729
switch (Op0.getOpcode()) {
11730
default:
11731
return SDValue();
11732
case ISD::EXTRACT_SUBVECTOR: {
11733
assert(Op0.getNumOperands() == 2 &&
11734
isa<ConstantSDNode>(Op0->getOperand(1)) &&
11735
"Node should have 2 operands with second one being a constant!");
11736
11737
if (Op0.getOperand(0).getValueType() != MVT::v4f32)
11738
return SDValue();
11739
11740
// Custom lower is only done for high or low doubleword.
11741
int Idx = Op0.getConstantOperandVal(1);
11742
if (Idx % 2 != 0)
11743
return SDValue();
11744
11745
// Since input is v4f32, at this point Idx is either 0 or 2.
11746
// Shift to get the doubleword position we want.
11747
int DWord = Idx >> 1;
11748
11749
// High and low word positions are different on little endian.
11750
if (Subtarget.isLittleEndian())
11751
DWord ^= 0x1;
11752
11753
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
11754
Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
11755
}
11756
case ISD::FADD:
11757
case ISD::FMUL:
11758
case ISD::FSUB: {
11759
SDValue NewLoad[2];
11760
for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
11761
// Ensure both input are loads.
11762
SDValue LdOp = Op0.getOperand(i);
11763
if (LdOp.getOpcode() != ISD::LOAD)
11764
return SDValue();
11765
// Generate new load node.
11766
LoadSDNode *LD = cast<LoadSDNode>(LdOp);
11767
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
11768
NewLoad[i] = DAG.getMemIntrinsicNode(
11769
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
11770
LD->getMemoryVT(), LD->getMemOperand());
11771
}
11772
SDValue NewOp =
11773
DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
11774
NewLoad[1], Op0.getNode()->getFlags());
11775
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
11776
DAG.getConstant(0, dl, MVT::i32));
11777
}
11778
case ISD::LOAD: {
11779
LoadSDNode *LD = cast<LoadSDNode>(Op0);
11780
SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
11781
SDValue NewLd = DAG.getMemIntrinsicNode(
11782
PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
11783
LD->getMemoryVT(), LD->getMemOperand());
11784
return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
11785
DAG.getConstant(0, dl, MVT::i32));
11786
}
11787
}
11788
llvm_unreachable("ERROR:Should return for all cases within swtich.");
11789
}
11790
11791
/// LowerOperation - Provide custom lowering hooks for some operations.
11792
///
11793
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
11794
switch (Op.getOpcode()) {
11795
default: llvm_unreachable("Wasn't expecting to be able to lower this!");
11796
case ISD::FPOW: return lowerPow(Op, DAG);
11797
case ISD::FSIN: return lowerSin(Op, DAG);
11798
case ISD::FCOS: return lowerCos(Op, DAG);
11799
case ISD::FLOG: return lowerLog(Op, DAG);
11800
case ISD::FLOG10: return lowerLog10(Op, DAG);
11801
case ISD::FEXP: return lowerExp(Op, DAG);
11802
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
11803
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
11804
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
11805
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
11806
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
11807
case ISD::STRICT_FSETCC:
11808
case ISD::STRICT_FSETCCS:
11809
case ISD::SETCC: return LowerSETCC(Op, DAG);
11810
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
11811
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
11812
11813
case ISD::INLINEASM:
11814
case ISD::INLINEASM_BR: return LowerINLINEASM(Op, DAG);
11815
// Variable argument lowering.
11816
case ISD::VASTART: return LowerVASTART(Op, DAG);
11817
case ISD::VAARG: return LowerVAARG(Op, DAG);
11818
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
11819
11820
case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);
11821
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
11822
case ISD::GET_DYNAMIC_AREA_OFFSET:
11823
return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
11824
11825
// Exception handling lowering.
11826
case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);
11827
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
11828
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
11829
11830
case ISD::LOAD: return LowerLOAD(Op, DAG);
11831
case ISD::STORE: return LowerSTORE(Op, DAG);
11832
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
11833
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
11834
case ISD::STRICT_FP_TO_UINT:
11835
case ISD::STRICT_FP_TO_SINT:
11836
case ISD::FP_TO_UINT:
11837
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
11838
case ISD::STRICT_UINT_TO_FP:
11839
case ISD::STRICT_SINT_TO_FP:
11840
case ISD::UINT_TO_FP:
11841
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
11842
case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
11843
11844
// Lower 64-bit shifts.
11845
case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
11846
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
11847
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
11848
11849
case ISD::FSHL: return LowerFunnelShift(Op, DAG);
11850
case ISD::FSHR: return LowerFunnelShift(Op, DAG);
11851
11852
// Vector-related lowering.
11853
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
11854
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
11855
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
11856
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
11857
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
11858
case ISD::MUL: return LowerMUL(Op, DAG);
11859
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
11860
case ISD::STRICT_FP_ROUND:
11861
case ISD::FP_ROUND:
11862
return LowerFP_ROUND(Op, DAG);
11863
case ISD::ROTL: return LowerROTL(Op, DAG);
11864
11865
// For counter-based loop handling.
11866
case ISD::INTRINSIC_W_CHAIN: return SDValue();
11867
11868
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
11869
11870
// Frame & Return address.
11871
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
11872
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
11873
11874
case ISD::INTRINSIC_VOID:
11875
return LowerINTRINSIC_VOID(Op, DAG);
11876
case ISD::BSWAP:
11877
return LowerBSWAP(Op, DAG);
11878
case ISD::ATOMIC_CMP_SWAP:
11879
return LowerATOMIC_CMP_SWAP(Op, DAG);
11880
case ISD::ATOMIC_STORE:
11881
return LowerATOMIC_LOAD_STORE(Op, DAG);
11882
case ISD::IS_FPCLASS:
11883
return LowerIS_FPCLASS(Op, DAG);
11884
}
11885
}
11886
11887
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
11888
SmallVectorImpl<SDValue>&Results,
11889
SelectionDAG &DAG) const {
11890
SDLoc dl(N);
11891
switch (N->getOpcode()) {
11892
default:
11893
llvm_unreachable("Do not know how to custom type legalize this operation!");
11894
case ISD::ATOMIC_LOAD: {
11895
SDValue Res = LowerATOMIC_LOAD_STORE(SDValue(N, 0), DAG);
11896
Results.push_back(Res);
11897
Results.push_back(Res.getValue(1));
11898
break;
11899
}
11900
case ISD::READCYCLECOUNTER: {
11901
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
11902
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
11903
11904
Results.push_back(
11905
DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));
11906
Results.push_back(RTB.getValue(2));
11907
break;
11908
}
11909
case ISD::INTRINSIC_W_CHAIN: {
11910
if (N->getConstantOperandVal(1) != Intrinsic::loop_decrement)
11911
break;
11912
11913
assert(N->getValueType(0) == MVT::i1 &&
11914
"Unexpected result type for CTR decrement intrinsic");
11915
EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
11916
N->getValueType(0));
11917
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
11918
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
11919
N->getOperand(1));
11920
11921
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
11922
Results.push_back(NewInt.getValue(1));
11923
break;
11924
}
11925
case ISD::INTRINSIC_WO_CHAIN: {
11926
switch (N->getConstantOperandVal(0)) {
11927
case Intrinsic::ppc_pack_longdouble:
11928
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
11929
N->getOperand(2), N->getOperand(1)));
11930
break;
11931
case Intrinsic::ppc_maxfe:
11932
case Intrinsic::ppc_minfe:
11933
case Intrinsic::ppc_fnmsub:
11934
case Intrinsic::ppc_convert_f128_to_ppcf128:
11935
Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG));
11936
break;
11937
}
11938
break;
11939
}
11940
case ISD::VAARG: {
11941
if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
11942
return;
11943
11944
EVT VT = N->getValueType(0);
11945
11946
if (VT == MVT::i64) {
11947
SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);
11948
11949
Results.push_back(NewNode);
11950
Results.push_back(NewNode.getValue(1));
11951
}
11952
return;
11953
}
11954
case ISD::STRICT_FP_TO_SINT:
11955
case ISD::STRICT_FP_TO_UINT:
11956
case ISD::FP_TO_SINT:
11957
case ISD::FP_TO_UINT: {
11958
// LowerFP_TO_INT() can only handle f32 and f64.
11959
if (N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
11960
MVT::ppcf128)
11961
return;
11962
SDValue LoweredValue = LowerFP_TO_INT(SDValue(N, 0), DAG, dl);
11963
Results.push_back(LoweredValue);
11964
if (N->isStrictFPOpcode())
11965
Results.push_back(LoweredValue.getValue(1));
11966
return;
11967
}
11968
case ISD::TRUNCATE: {
11969
if (!N->getValueType(0).isVector())
11970
return;
11971
SDValue Lowered = LowerTRUNCATEVector(SDValue(N, 0), DAG);
11972
if (Lowered)
11973
Results.push_back(Lowered);
11974
return;
11975
}
11976
case ISD::FSHL:
11977
case ISD::FSHR:
11978
// Don't handle funnel shifts here.
11979
return;
11980
case ISD::BITCAST:
11981
// Don't handle bitcast here.
11982
return;
11983
case ISD::FP_EXTEND:
11984
SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);
11985
if (Lowered)
11986
Results.push_back(Lowered);
11987
return;
11988
}
11989
}
11990
11991
//===----------------------------------------------------------------------===//
11992
// Other Lowering Code
11993
//===----------------------------------------------------------------------===//
11994
11995
/// Emit a call, with no arguments, to the intrinsic \p Id at the builder's
/// current insertion point and return the created call instruction.
static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
  // Walk from the insertion block up to the module owning it.
  Function *EnclosingFn = Builder.GetInsertBlock()->getParent();
  Module *Mod = EnclosingFn->getParent();
  return Builder.CreateCall(Intrinsic::getDeclaration(Mod, Id), {});
}
12000
12001
// The mappings for emitLeading/TrailingFence is taken from
12002
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
12003
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
12004
Instruction *Inst,
12005
AtomicOrdering Ord) const {
12006
if (Ord == AtomicOrdering::SequentiallyConsistent)
12007
return callIntrinsic(Builder, Intrinsic::ppc_sync);
12008
if (isReleaseOrStronger(Ord))
12009
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
12010
return nullptr;
12011
}
12012
12013
/// Trailing fence: only emitted when \p Inst has an atomic load and \p Ord is
/// acquire or stronger. A plain load gets a ppc_cfence call that takes the
/// loaded value as its argument; anything else gets a ppc_lwsync call.
/// Returns nullptr when no fence is needed.
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  if (!Inst->hasAtomicLoad() || !isAcquireOrStronger(Ord))
    return nullptr;

  // FIXME: Can use isync for rmw operation.
  if (!isa<LoadInst>(Inst))
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);

  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
  Function *CFence =
      Intrinsic::getDeclaration(Mod, Intrinsic::ppc_cfence, {Inst->getType()});
  return Builder.CreateCall(CFence, {Inst});
}
12031
12032
/// Expand an atomic read-modify-write pseudo \p MI into a load-reserve /
/// store-conditional retry loop.
///
/// \p AtomicSize selects the access width in bytes (1, 2, 4 or 8).
/// \p BinOpcode is the operation applied to the loaded value; 0 means
/// ATOMIC_SWAP. A non-zero \p CmpOpcode adds a compare against the incoming
/// value and a branch on \p CmpPred that leaves the loop without storing
/// (used for min/max). Returns the block holding the code that followed MI.
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned AtomicSize,
                                    unsigned BinOpcode,
                                    unsigned CmpOpcode,
                                    unsigned CmpPred) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // Pick the load-reserve / store-conditional pair matching the width.
  unsigned LoadOpc = PPC::LDARX;
  unsigned StoreOpc = PPC::STDCX;
  switch (AtomicSize) {
  case 1:
    LoadOpc = PPC::LBARX;
    StoreOpc = PPC::STBCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 2:
    LoadOpc = PPC::LHARX;
    StoreOpc = PPC::STHCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 4:
    LoadOpc = PPC::LWARX;
    StoreOpc = PPC::STWCX;
    break;
  case 8:
    LoadOpc = PPC::LDARX;
    StoreOpc = PPC::STDCX;
    break;
  default:
    llvm_unreachable("Unexpected size of atomic entity");
  }

  const BasicBlock *IRBlock = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator InsertPos = ++BB->getIterator();

  Register DestReg = MI.getOperand(0).getReg();
  Register PtrAReg = MI.getOperand(1).getReg();
  Register PtrBReg = MI.getOperand(2).getReg();
  Register IncrReg = MI.getOperand(3).getReg();
  DebugLoc dl = MI.getDebugLoc();

  // Create the new blocks (the second loop block only exists for min/max)
  // and place them right after BB.
  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *Loop2MBB =
      CmpOpcode ? MF->CreateMachineBasicBlock(IRBlock) : nullptr;
  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(IRBlock);
  MF->insert(InsertPos, LoopMBB);
  if (CmpOpcode)
    MF->insert(InsertPos, Loop2MBB);
  MF->insert(InsertPos, ExitMBB);

  // Everything after MI, together with BB's successor edges, moves to the
  // exit block.
  ExitMBB->splice(ExitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // The value stored back: the incoming value itself for a swap, otherwise a
  // fresh register receiving the result of the binary operation.
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register StoreValReg = IncrReg;
  if (BinOpcode)
    StoreValReg = MRI.createVirtualRegister(
        AtomicSize == 8 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(LoopMBB);

  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   add r0, dest, incr
  //   st[wd]cx. r0, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  // For max/min...
  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   cmpl?[wd] dest, incr
  //   bgt exitMBB
  //  loop2MBB:
  //   st[wd]cx. dest, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  MachineBasicBlock *CurMBB = LoopMBB;
  BuildMI(CurMBB, dl, TII->get(LoadOpc), DestReg)
      .addReg(PtrAReg)
      .addReg(PtrBReg);
  if (BinOpcode)
    BuildMI(CurMBB, dl, TII->get(BinOpcode), StoreValReg)
        .addReg(IncrReg)
        .addReg(DestReg);

  if (CmpOpcode) {
    Register CondReg = MRI.createVirtualRegister(&PPC::CRRCRegClass);

    // Signed comparisons of byte or halfword values must be sign-extended.
    Register CmpLHSReg = DestReg;
    if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
      Register ExtendedReg = MRI.createVirtualRegister(&PPC::GPRCRegClass);
      BuildMI(CurMBB, dl,
              TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH), ExtendedReg)
          .addReg(DestReg);
      CmpLHSReg = ExtendedReg;
    }
    BuildMI(CurMBB, dl, TII->get(CmpOpcode), CondReg)
        .addReg(CmpLHSReg)
        .addReg(IncrReg);

    // Leave without storing when the predicate holds.
    BuildMI(CurMBB, dl, TII->get(PPC::BCC))
        .addImm(CmpPred)
        .addReg(CondReg)
        .addMBB(ExitMBB);
    CurMBB->addSuccessor(Loop2MBB);
    CurMBB->addSuccessor(ExitMBB);
    CurMBB = Loop2MBB;
  }

  // Conditional store; retry from the top of the loop when CR0 says "ne".
  BuildMI(CurMBB, dl, TII->get(StoreOpc))
      .addReg(StoreValReg)
      .addReg(PtrAReg)
      .addReg(PtrBReg);
  BuildMI(CurMBB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE)
      .addReg(PPC::CR0)
      .addMBB(LoopMBB);
  CurMBB->addSuccessor(LoopMBB);
  CurMBB->addSuccessor(ExitMBB);

  //  exitMBB:
  //   ...
  return ExitMBB;
}
12151
12152
/// Return true when \p MI is one of the opcodes listed below as producing a
/// sign-extended result. A COPY defers to PPCInstrInfo::isSignExtended on its
/// source register; every other opcode yields false.
static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case PPC::COPY:
    return TII->isSignExtended(MI.getOperand(1).getReg(),
                               &MI.getMF()->getRegInfo());

  // LHA* / LWA* loads.
  case PPC::LHA: case PPC::LHA8: case PPC::LHAU: case PPC::LHAU8:
  case PPC::LHAUX: case PPC::LHAUX8: case PPC::LHAX: case PPC::LHAX8:
  case PPC::LWA: case PPC::LWAUX: case PPC::LWAX: case PPC::LWAX_32:
  case PPC::LWA_32:
  // PLHA* / PLWA* loads.
  case PPC::PLHA: case PPC::PLHA8: case PPC::PLHA8pc: case PPC::PLHApc:
  case PPC::PLWA: case PPC::PLWA8: case PPC::PLWA8pc: case PPC::PLWApc:
  // EXTSB* / EXTSH*.
  case PPC::EXTSB: case PPC::EXTSB8: case PPC::EXTSB8_32_64:
  case PPC::EXTSB8_rec: case PPC::EXTSB_rec:
  case PPC::EXTSH: case PPC::EXTSH8: case PPC::EXTSH8_32_64:
  case PPC::EXTSH8_rec: case PPC::EXTSH_rec:
  // EXTSW* / EXTSWSLI*.
  case PPC::EXTSW: case PPC::EXTSWSLI: case PPC::EXTSWSLI_32_64:
  case PPC::EXTSWSLI_32_64_rec: case PPC::EXTSWSLI_rec:
  case PPC::EXTSW_32: case PPC::EXTSW_32_64: case PPC::EXTSW_32_64_rec:
  case PPC::EXTSW_rec:
  // SRAW*.
  case PPC::SRAW: case PPC::SRAWI: case PPC::SRAWI_rec: case PPC::SRAW_rec:
    return true;

  default:
    return false;
  }
}
12207
12208
/// Expand an 8-bit (\p is8bit) or 16-bit atomic read-modify-write pseudo.
///
/// With part-word atomic support this forwards to EmitAtomicBinary with size
/// 1 or 2. Otherwise the operation is emulated with a word-sized lwarx/stwcx.
/// loop on the containing aligned word, using a shift amount and a mask to
/// isolate the byte/halfword. \p BinOpcode == 0 means ATOMIC_SWAP; a non-zero
/// \p CmpOpcode adds the compare-and-exit used for min/max, branching on
/// \p CmpPred. Returns the block holding the code that followed MI.
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
    MachineInstr &MI, MachineBasicBlock *BB,
    bool is8bit, // operation
    unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
  const PPCInstrInfo *TII = Subtarget.getInstrInfo();
  const unsigned SignExtOpc = is8bit ? PPC::EXTSB : PPC::EXTSH;

  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register IncrReg = MI.getOperand(3).getReg();

  // If this is a signed comparison and the value being compared is not known
  // to be sign extended, sign extend it here (in front of MI) and make MI use
  // the extended value.
  const bool IncrIsSignExtended =
      IncrReg.isVirtual() && isSignExtended(*MRI.getVRegDef(IncrReg), TII);
  if (CmpOpcode == PPC::CMPW && !IncrIsSignExtended) {
    Register ExtIncrReg = MRI.createVirtualRegister(&PPC::GPRCRegClass);
    BuildMI(*BB, MI, dl, TII->get(SignExtOpc), ExtIncrReg).addReg(IncrReg);
    MI.getOperand(3).setReg(ExtIncrReg);
    IncrReg = ExtIncrReg;
  }

  // If we support part-word atomic mnemonics, just use them.
  if (Subtarget.hasPartwordAtomics())
    return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
                            CmpPred);

  // In 64 bit mode we have to use 64 bits for addresses, even though the
  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
  // registers without caring whether they're 32 or 64, but here we're
  // doing actual arithmetic on the addresses.
  const bool Is64Bit = Subtarget.isPPC64();
  const bool IsLE = Subtarget.isLittleEndian();
  unsigned ZeroReg = Is64Bit ? PPC::ZERO8 : PPC::ZERO;

  const BasicBlock *IRBlock = BB->getBasicBlock();
  MachineFunction::iterator InsertPos = ++BB->getIterator();

  Register DestReg = MI.getOperand(0).getReg();
  Register PtrAReg = MI.getOperand(1).getReg();
  Register PtrBReg = MI.getOperand(2).getReg();

  // Create the new blocks (the second loop block only exists for min/max)
  // and place them right after BB.
  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *Loop2MBB =
      CmpOpcode ? MF->CreateMachineBasicBlock(IRBlock) : nullptr;
  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(IRBlock);
  MF->insert(InsertPos, LoopMBB);
  if (CmpOpcode)
    MF->insert(InsertPos, Loop2MBB);
  MF->insert(InsertPos, ExitMBB);

  // Everything after MI, together with BB's successor edges, moves to the
  // exit block.
  ExitMBB->splice(ExitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *PtrRC =
      Is64Bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  const TargetRegisterClass *WordRC = &PPC::GPRCRegClass;

  // Scratch registers. On little endian the shift amount needs no xori, so
  // the same register serves as both Shift1 and Shift.
  Register AlignedPtrReg = MRI.createVirtualRegister(PtrRC);
  Register Shift1Reg = MRI.createVirtualRegister(WordRC);
  Register ShiftReg = IsLE ? Shift1Reg : MRI.createVirtualRegister(WordRC);
  Register ShiftedIncrReg = MRI.createVirtualRegister(WordRC);
  Register MaskReg = MRI.createVirtualRegister(WordRC);
  Register Mask2Reg = MRI.createVirtualRegister(WordRC);
  Register Mask3Reg = MRI.createVirtualRegister(WordRC);
  Register KeptBitsReg = MRI.createVirtualRegister(WordRC);
  Register NewBitsReg = MRI.createVirtualRegister(WordRC);
  Register MergedReg = MRI.createVirtualRegister(WordRC);
  Register LoadedWordReg = MRI.createVirtualRegister(WordRC);
  Register ShiftedDestReg = MRI.createVirtualRegister(WordRC);
  Register FullPtrReg;
  // For a swap the shifted incoming value is what gets merged in; otherwise
  // a fresh register receives the result of the binary operation.
  Register OpResultReg = ShiftedIncrReg;
  if (BinOpcode)
    OpResultReg = MRI.createVirtualRegister(WordRC);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(LoopMBB);

  // The 4-byte load must be aligned, while a char or short may be
  // anywhere in the word.  Hence all this nasty bookkeeping code.
  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
  //   xori shift, shift1, 24 [16]
  //   rlwinm ptr, ptr1, 0, 0, 29
  //   slw incr2, incr, shift
  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
  //   slw mask, mask2, shift
  //  loopMBB:
  //   lwarx tmpDest, ptr
  //   add tmp, tmpDest, incr2
  //   andc tmp2, tmpDest, mask
  //   and tmp3, tmp, mask
  //   or tmp4, tmp3, tmp2
  //   stwcx. tmp4, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //   srw SrwDest, tmpDest, shift
  //   rlwinm SrwDest, SrwDest, 0, 24 [16], 31
  if (PtrAReg != ZeroReg) {
    FullPtrReg = MRI.createVirtualRegister(PtrRC);
    BuildMI(BB, dl, TII->get(Is64Bit ? PPC::ADD8 : PPC::ADD4), FullPtrReg)
        .addReg(PtrAReg)
        .addReg(PtrBReg);
  } else {
    FullPtrReg = PtrBReg;
  }

  // We need use 32-bit subregister to avoid mismatch register class in 64-bit
  // mode.
  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
      .addReg(FullPtrReg, 0, Is64Bit ? PPC::sub_32 : 0)
      .addImm(3)
      .addImm(27)
      .addImm(is8bit ? 28 : 27);
  if (!IsLE)
    BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
        .addReg(Shift1Reg)
        .addImm(is8bit ? 24 : 16);

  // Clear the low two address bits to get the aligned word address.
  if (Is64Bit)
    BuildMI(BB, dl, TII->get(PPC::RLDICR), AlignedPtrReg)
        .addReg(FullPtrReg)
        .addImm(0)
        .addImm(61);
  else
    BuildMI(BB, dl, TII->get(PPC::RLWINM), AlignedPtrReg)
        .addReg(FullPtrReg)
        .addImm(0)
        .addImm(0)
        .addImm(29);

  // Move the incoming value into position within the word.
  BuildMI(BB, dl, TII->get(PPC::SLW), ShiftedIncrReg)
      .addReg(IncrReg)
      .addReg(ShiftReg);

  // Build the mask (255 or 65535) and move it into position as well.
  if (is8bit) {
    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
  } else {
    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
    BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
        .addReg(Mask3Reg)
        .addImm(65535);
  }
  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
      .addReg(Mask2Reg)
      .addReg(ShiftReg);

  MachineBasicBlock *CurMBB = LoopMBB;
  BuildMI(CurMBB, dl, TII->get(PPC::LWARX), LoadedWordReg)
      .addReg(ZeroReg)
      .addReg(AlignedPtrReg);
  if (BinOpcode)
    BuildMI(CurMBB, dl, TII->get(BinOpcode), OpResultReg)
        .addReg(ShiftedIncrReg)
        .addReg(LoadedWordReg);
  // Bits of the word outside the mask are kept; bits inside come from the
  // operation result.
  BuildMI(CurMBB, dl, TII->get(PPC::ANDC), KeptBitsReg)
      .addReg(LoadedWordReg)
      .addReg(MaskReg);
  BuildMI(CurMBB, dl, TII->get(PPC::AND), NewBitsReg)
      .addReg(OpResultReg)
      .addReg(MaskReg);

  if (CmpOpcode) {
    // For unsigned comparisons, we can directly compare the shifted values.
    // For signed comparisons we shift and sign extend.
    Register MaskedOldReg = MRI.createVirtualRegister(WordRC);
    Register CondReg = MRI.createVirtualRegister(&PPC::CRRCRegClass);
    BuildMI(CurMBB, dl, TII->get(PPC::AND), MaskedOldReg)
        .addReg(LoadedWordReg)
        .addReg(MaskReg);

    Register CmpLHSReg = MaskedOldReg;
    Register CmpRHSReg = ShiftedIncrReg;
    if (CmpOpcode == PPC::CMPW) {
      Register UnshiftedReg = MRI.createVirtualRegister(WordRC);
      BuildMI(CurMBB, dl, TII->get(PPC::SRW), UnshiftedReg)
          .addReg(MaskedOldReg)
          .addReg(ShiftReg);
      Register SignExtendedReg = MRI.createVirtualRegister(WordRC);
      BuildMI(CurMBB, dl, TII->get(SignExtOpc), SignExtendedReg)
          .addReg(UnshiftedReg);
      CmpLHSReg = SignExtendedReg;
      CmpRHSReg = IncrReg;
    }
    BuildMI(CurMBB, dl, TII->get(CmpOpcode), CondReg)
        .addReg(CmpLHSReg)
        .addReg(CmpRHSReg);

    // Leave without storing when the predicate holds.
    BuildMI(CurMBB, dl, TII->get(PPC::BCC))
        .addImm(CmpPred)
        .addReg(CondReg)
        .addMBB(ExitMBB);
    CurMBB->addSuccessor(Loop2MBB);
    CurMBB->addSuccessor(ExitMBB);
    CurMBB = Loop2MBB;
  }

  BuildMI(CurMBB, dl, TII->get(PPC::OR), MergedReg)
      .addReg(NewBitsReg)
      .addReg(KeptBitsReg);
  BuildMI(CurMBB, dl, TII->get(PPC::STWCX))
      .addReg(MergedReg)
      .addReg(ZeroReg)
      .addReg(AlignedPtrReg);
  // Retry from the top of the loop when CR0 says "ne".
  BuildMI(CurMBB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE)
      .addReg(PPC::CR0)
      .addMBB(LoopMBB);
  CurMBB->addSuccessor(LoopMBB);
  CurMBB->addSuccessor(ExitMBB);

  //  exitMBB:
  //   ...
  // Since the shift amount is not a constant, we need to clear
  // the upper bits with a separate RLWINM.
  // Both instructions are inserted at the start of the exit block, so the SRW
  // (inserted second) ends up in front of the RLWINM that consumes it.
  BuildMI(*ExitMBB, ExitMBB->begin(), dl, TII->get(PPC::RLWINM), DestReg)
      .addReg(ShiftedDestReg)
      .addImm(0)
      .addImm(is8bit ? 24 : 16)
      .addImm(31);
  BuildMI(*ExitMBB, ExitMBB->begin(), dl, TII->get(PPC::SRW), ShiftedDestReg)
      .addReg(LoadedWordReg)
      .addReg(ShiftReg);
  return ExitMBB;
}
12421
12422
/// Expand the SjLj setjmp pseudo \p MI.
///
/// The code after MI moves into a new sink block, which defines MI's result
/// as a PHI of 0 (arriving from mainMBB) and 1 (arriving from the original
/// block). MI itself is erased. Returns the sink block.
llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const bool IsPPC64 = Subtarget.isPPC64();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *IRBlock = MBB->getBasicBlock();
  MachineFunction::iterator InsertPos = ++MBB->getIterator();

  Register ResultReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *ResultRC = MRI.getRegClass(ResultReg);
  assert(TRI->isTypeLegalForClass(*ResultRC, MVT::i32) &&
         "Invalid destination!");
  Register MainResultReg = MRI.createVirtualRegister(ResultRC);
  Register RestoreResultReg = MRI.createVirtualRegister(ResultRC);

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  // For v = setjmp(buf), the blocks are arranged as
  //
  // thisMBB:
  //  (stores into buf)
  //  bl mainMBB
  //  v_restore = 1
  //  SjLjSetup mainMBB
  //  b sinkMBB
  //
  // mainMBB:
  //  buf[LabelOffset] = LR
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  MachineBasicBlock *ThisMBB = MBB;
  MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(IRBlock);
  MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(IRBlock);
  MF->insert(InsertPos, MainMBB);
  MF->insert(InsertPos, SinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Note that the structure of the jmp_buf used here is not compatible
  // with that used by libc, and is not designed to be. Specifically, it
  // stores only those 'reserved' registers that LLVM does not otherwise
  // understand how to spill. Also, by convention, by the time this
  // intrinsic is called, Clang has already stored the frame address in the
  // first slot of the buffer and stack address in the third. Following the
  // X86 target code, we'll store the jump address in the second slot. We also
  // need to save the TOC pointer (R2) to handle jumps between shared
  // libraries, and that will be stored in the fourth slot. The thread
  // identifier (R13) is not affected.

  // Buffer offsets, in units of the pointer store size.
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t TOCOffset = 3 * PVT.getStoreSize();
  const int64_t BPOffset = 4 * PVT.getStoreSize();

  // Register that will receive the jump address (read from LR in mainMBB).
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
  Register LabelReg = MRI.createVirtualRegister(PtrRC);
  Register BufReg = MI.getOperand(1).getReg();

  // thisMBB: every instruction below is inserted in front of MI.
  if (Subtarget.is64BitELFABI()) {
    // Save the TOC pointer (X2) into its buffer slot.
    setUsesTOCBasePtr(*MF);
    BuildMI(*ThisMBB, MI, DL, TII->get(PPC::STD))
        .addReg(PPC::X2)
        .addImm(TOCOffset)
        .addReg(BufReg)
        .cloneMemRefs(MI);
  }

  // Naked functions never have a base pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  const bool IsNaked = MF->getFunction().hasFnAttribute(Attribute::Naked);
  unsigned BaseReg;
  if (IsNaked)
    BaseReg = IsPPC64 ? PPC::X1 : PPC::R1;
  else
    BaseReg = IsPPC64 ? PPC::BP8 : PPC::BP;

  BuildMI(*ThisMBB, MI, DL, TII->get(IsPPC64 ? PPC::STD : PPC::STW))
      .addReg(BaseReg)
      .addImm(BPOffset)
      .addReg(BufReg)
      .cloneMemRefs(MI);

  // Setup: branch-and-link to mainMBB, with a no-preserved-registers mask.
  BuildMI(*ThisMBB, MI, DL, TII->get(PPC::BCLalways))
      .addMBB(MainMBB)
      .addRegMask(TRI->getNoPreservedMask());

  BuildMI(*ThisMBB, MI, DL, TII->get(PPC::LI), RestoreResultReg).addImm(1);

  BuildMI(*ThisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)).addMBB(MainMBB);
  BuildMI(*ThisMBB, MI, DL, TII->get(PPC::B)).addMBB(SinkMBB);

  ThisMBB->addSuccessor(MainMBB, BranchProbability::getZero());
  ThisMBB->addSuccessor(SinkMBB, BranchProbability::getOne());

  // mainMBB:
  //  LabelReg = LR; buf[LabelOffset] = LabelReg; MainResultReg = 0
  BuildMI(MainMBB, DL, TII->get(IsPPC64 ? PPC::MFLR8 : PPC::MFLR), LabelReg);

  // Store IP.
  BuildMI(MainMBB, DL, TII->get(IsPPC64 ? PPC::STD : PPC::STW))
      .addReg(LabelReg)
      .addImm(LabelOffset)
      .addReg(BufReg)
      .cloneMemRefs(MI);

  BuildMI(MainMBB, DL, TII->get(PPC::LI), MainResultReg).addImm(0);
  MainMBB->addSuccessor(SinkMBB);

  // sinkMBB: merge the two possible results.
  BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(PPC::PHI), ResultReg)
      .addReg(MainResultReg)
      .addMBB(MainMBB)
      .addReg(RestoreResultReg)
      .addMBB(ThisMBB);

  MI.eraseFromParent();
  return SinkMBB;
}
12563
12564
// Expand an EH_SjLj_LongJmp pseudo in place. The frame pointer, the resume
// address, the stack pointer, the base pointer and (for 64-bit SVR4) the TOC
// pointer are reloaded from the jump buffer whose address is operand 0 of
// \p MI, and control is then transferred to the resume address through CTR.
// Returns \p MBB; no new basic blocks are created.
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  const bool Is64Bit = PVT == MVT::i64;

  // Virtual register that carries the resume address into CTR.
  Register Tmp = MF->getRegInfo().createVirtualRegister(
      Is64Bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass);

  // Physical registers that get overwritten from the buffer.
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  unsigned FP, SP, BP;
  if (Is64Bit) {
    FP = PPC::X31;
    SP = PPC::X1;
    BP = PPC::X30;
  } else {
    FP = PPC::R31;
    SP = PPC::R1;
    BP = (Subtarget.isSVR4ABI() && isPositionIndependent()) ? PPC::R29
                                                            : PPC::R30;
  }

  // Buffer layout in pointer-sized slots: slot 0 holds FP, followed by the
  // label, SP, TOC and BP slots.
  const int64_t SlotSize = PVT.getStoreSize();
  const int64_t LabelOffset = 1 * SlotSize;
  const int64_t SPOffset = 2 * SlotSize;
  const int64_t TOCOffset = 3 * SlotSize;
  const int64_t BPOffset = 4 * SlotSize;

  Register BufReg = MI.getOperand(0).getReg();

  // Emit one pointer-sized load of Dest from BufReg + Offset before MI,
  // carrying over the memory operands of the pseudo.
  auto ReloadFromBuf = [&](Register Dest, int64_t Offset) {
    BuildMI(*MBB, MI, DL, TII->get(Is64Bit ? PPC::LD : PPC::LWZ), Dest)
        .addImm(Offset)
        .addReg(BufReg)
        .cloneMemRefs(MI);
  };

  // Reload FP (the jumped-to function may not have had a frame pointer, and
  // if so, then its r31 will be restored as necessary).
  ReloadFromBuf(FP, 0);

  // Reload IP
  ReloadFromBuf(Tmp, LabelOffset);

  // Reload SP
  ReloadFromBuf(SP, SPOffset);

  // Reload BP
  ReloadFromBuf(BP, BPOffset);

  // Reload TOC
  if (Is64Bit && Subtarget.isSVR4ABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    ReloadFromBuf(PPC::X2, TOCOffset);
  }

  // Jump: move the resume address into CTR and branch through it.
  BuildMI(*MBB, MI, DL, TII->get(Is64Bit ? PPC::MTCTR8 : PPC::MTCTR))
      .addReg(Tmp);
  BuildMI(*MBB, MI, DL, TII->get(Is64Bit ? PPC::BCTR8 : PPC::BCTR));

  MI.eraseFromParent();
  return MBB;
}
12665
12666
bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
12667
// If the function specifically requests inline stack probes, emit them.
12668
if (MF.getFunction().hasFnAttribute("probe-stack"))
12669
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
12670
"inline-asm";
12671
return false;
12672
}
12673
12674
unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const {
12675
const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
12676
unsigned StackAlign = TFI->getStackAlignment();
12677
assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
12678
"Unexpected stack alignment");
12679
// The default stack probe size is 4096 if the function has no
12680
// stack-probe-size attribute.
12681
const Function &Fn = MF.getFunction();
12682
unsigned StackProbeSize =
12683
Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
12684
// Round down to the stack alignment.
12685
StackProbeSize &= ~(StackAlign - 1);
12686
return StackProbeSize ? StackProbeSize : StackAlign;
12687
}
12688
12689
// Lower dynamic stack allocation with probing. `emitProbedAlloca` is splitted
12690
// into three phases. In the first phase, it uses pseudo instruction
12691
// PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and
12692
// FinalStackPtr. In the second phase, it generates a loop for probing blocks.
12693
// At last, it uses pseudo instruction DYNAREAOFFSET to get the future result of
12694
// MaxCallFrameSize so that it can calculate correct data area pointer.
12695
MachineBasicBlock *
12696
PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
12697
MachineBasicBlock *MBB) const {
12698
const bool isPPC64 = Subtarget.isPPC64();
12699
MachineFunction *MF = MBB->getParent();
12700
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
12701
DebugLoc DL = MI.getDebugLoc();
12702
const unsigned ProbeSize = getStackProbeSize(*MF);
12703
const BasicBlock *ProbedBB = MBB->getBasicBlock();
12704
MachineRegisterInfo &MRI = MF->getRegInfo();
12705
// The CFG of probing stack looks as
12706
// +-----+
12707
// | MBB |
12708
// +--+--+
12709
// |
12710
// +----v----+
12711
// +--->+ TestMBB +---+
12712
// | +----+----+ |
12713
// | | |
12714
// | +-----v----+ |
12715
// +---+ BlockMBB | |
12716
// +----------+ |
12717
// |
12718
// +---------+ |
12719
// | TailMBB +<--+
12720
// +---------+
12721
// In MBB, calculate previous frame pointer and final stack pointer.
12722
// In TestMBB, test if sp is equal to final stack pointer, if so, jump to
12723
// TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
12724
// TailMBB is spliced via \p MI.
12725
MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
12726
MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
12727
MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);
12728
12729
MachineFunction::iterator MBBIter = ++MBB->getIterator();
12730
MF->insert(MBBIter, TestMBB);
12731
MF->insert(MBBIter, BlockMBB);
12732
MF->insert(MBBIter, TailMBB);
12733
12734
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
12735
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
12736
12737
Register DstReg = MI.getOperand(0).getReg();
12738
Register NegSizeReg = MI.getOperand(1).getReg();
12739
Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
12740
Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12741
Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12742
Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12743
12744
// Since value of NegSizeReg might be realigned in prologepilog, insert a
12745
// PREPARE_PROBED_ALLOCA pseudo instruction to get actual FramePointer and
12746
// NegSize.
12747
unsigned ProbeOpc;
12748
if (!MRI.hasOneNonDBGUse(NegSizeReg))
12749
ProbeOpc =
12750
isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
12751
else
12752
// By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg
12753
// and NegSizeReg will be allocated in the same phyreg to avoid
12754
// redundant copy when NegSizeReg has only one use which is current MI and
12755
// will be replaced by PREPARE_PROBED_ALLOCA then.
12756
ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
12757
: PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
12758
BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer)
12759
.addDef(ActualNegSizeReg)
12760
.addReg(NegSizeReg)
12761
.add(MI.getOperand(2))
12762
.add(MI.getOperand(3));
12763
12764
// Calculate final stack pointer, which equals to SP + ActualNegSize.
12765
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4),
12766
FinalStackPtr)
12767
.addReg(SPReg)
12768
.addReg(ActualNegSizeReg);
12769
12770
// Materialize a scratch register for update.
12771
int64_t NegProbeSize = -(int64_t)ProbeSize;
12772
assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
12773
Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12774
if (!isInt<16>(NegProbeSize)) {
12775
Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12776
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
12777
.addImm(NegProbeSize >> 16);
12778
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
12779
ScratchReg)
12780
.addReg(TempReg)
12781
.addImm(NegProbeSize & 0xFFFF);
12782
} else
12783
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
12784
.addImm(NegProbeSize);
12785
12786
{
12787
// Probing leading residual part.
12788
Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12789
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
12790
.addReg(ActualNegSizeReg)
12791
.addReg(ScratchReg);
12792
Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12793
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
12794
.addReg(Div)
12795
.addReg(ScratchReg);
12796
Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12797
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
12798
.addReg(Mul)
12799
.addReg(ActualNegSizeReg);
12800
BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
12801
.addReg(FramePointer)
12802
.addReg(SPReg)
12803
.addReg(NegMod);
12804
}
12805
12806
{
12807
// Remaining part should be multiple of ProbeSize.
12808
Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
12809
BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
12810
.addReg(SPReg)
12811
.addReg(FinalStackPtr);
12812
BuildMI(TestMBB, DL, TII->get(PPC::BCC))
12813
.addImm(PPC::PRED_EQ)
12814
.addReg(CmpResult)
12815
.addMBB(TailMBB);
12816
TestMBB->addSuccessor(BlockMBB);
12817
TestMBB->addSuccessor(TailMBB);
12818
}
12819
12820
{
12821
// Touch the block.
12822
// |P...|P...|P...
12823
BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
12824
.addReg(FramePointer)
12825
.addReg(SPReg)
12826
.addReg(ScratchReg);
12827
BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
12828
BlockMBB->addSuccessor(TestMBB);
12829
}
12830
12831
// Calculation of MaxCallFrameSize is deferred to prologepilog, use
12832
// DYNAREAOFFSET pseudo instruction to get the future result.
12833
Register MaxCallFrameSizeReg =
12834
MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
12835
BuildMI(TailMBB, DL,
12836
TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
12837
MaxCallFrameSizeReg)
12838
.add(MI.getOperand(2))
12839
.add(MI.getOperand(3));
12840
BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg)
12841
.addReg(SPReg)
12842
.addReg(MaxCallFrameSizeReg);
12843
12844
// Splice instructions after MI to TailMBB.
12845
TailMBB->splice(TailMBB->end(), MBB,
12846
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
12847
TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
12848
MBB->addSuccessor(TestMBB);
12849
12850
// Delete the pseudo instruction.
12851
MI.eraseFromParent();
12852
12853
++NumDynamicAllocaProbed;
12854
return TailMBB;
12855
}
12856
12857
// Returns true if \p MI is one of the SELECT_CC_* pseudo instructions.
static bool IsSelectCC(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  return Opc == PPC::SELECT_CC_I4 || Opc == PPC::SELECT_CC_I8 ||
         Opc == PPC::SELECT_CC_F4 || Opc == PPC::SELECT_CC_F8 ||
         Opc == PPC::SELECT_CC_F16 || Opc == PPC::SELECT_CC_VRRC ||
         Opc == PPC::SELECT_CC_VSFRC || Opc == PPC::SELECT_CC_VSSRC ||
         Opc == PPC::SELECT_CC_VSRC || Opc == PPC::SELECT_CC_SPE4 ||
         Opc == PPC::SELECT_CC_SPE;
}
12875
12876
// Returns true if \p MI is one of the SELECT_* pseudo instructions (the
// variants without a condition-code operand in their name).
static bool IsSelect(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  return Opc == PPC::SELECT_I4 || Opc == PPC::SELECT_I8 ||
         Opc == PPC::SELECT_F4 || Opc == PPC::SELECT_F8 ||
         Opc == PPC::SELECT_F16 || Opc == PPC::SELECT_SPE ||
         Opc == PPC::SELECT_SPE4 || Opc == PPC::SELECT_VRRC ||
         Opc == PPC::SELECT_VSFRC || Opc == PPC::SELECT_VSSRC ||
         Opc == PPC::SELECT_VSRC;
}
12894
12895
MachineBasicBlock *
12896
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
12897
MachineBasicBlock *BB) const {
12898
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
12899
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
12900
if (Subtarget.is64BitELFABI() &&
12901
MI.getOpcode() == TargetOpcode::PATCHPOINT &&
12902
!Subtarget.isUsingPCRelativeCalls()) {
12903
// Call lowering should have added an r2 operand to indicate a dependence
12904
// on the TOC base pointer value. It can't however, because there is no
12905
// way to mark the dependence as implicit there, and so the stackmap code
12906
// will confuse it with a regular operand. Instead, add the dependence
12907
// here.
12908
MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
12909
}
12910
12911
return emitPatchPoint(MI, BB);
12912
}
12913
12914
if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
12915
MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
12916
return emitEHSjLjSetJmp(MI, BB);
12917
} else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
12918
MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
12919
return emitEHSjLjLongJmp(MI, BB);
12920
}
12921
12922
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
12923
12924
// To "insert" these instructions we actually have to insert their
12925
// control-flow patterns.
12926
const BasicBlock *LLVM_BB = BB->getBasicBlock();
12927
MachineFunction::iterator It = ++BB->getIterator();
12928
12929
MachineFunction *F = BB->getParent();
12930
MachineRegisterInfo &MRI = F->getRegInfo();
12931
12932
if (Subtarget.hasISEL() &&
12933
(MI.getOpcode() == PPC::SELECT_CC_I4 ||
12934
MI.getOpcode() == PPC::SELECT_CC_I8 ||
12935
MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
12936
SmallVector<MachineOperand, 2> Cond;
12937
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
12938
MI.getOpcode() == PPC::SELECT_CC_I8)
12939
Cond.push_back(MI.getOperand(4));
12940
else
12941
Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
12942
Cond.push_back(MI.getOperand(1));
12943
12944
DebugLoc dl = MI.getDebugLoc();
12945
TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
12946
MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
12947
} else if (IsSelectCC(MI) || IsSelect(MI)) {
12948
// The incoming instruction knows the destination vreg to set, the
12949
// condition code register to branch on, the true/false values to
12950
// select between, and a branch opcode to use.
12951
12952
// thisMBB:
12953
// ...
12954
// TrueVal = ...
12955
// cmpTY ccX, r1, r2
12956
// bCC sinkMBB
12957
// fallthrough --> copy0MBB
12958
MachineBasicBlock *thisMBB = BB;
12959
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12960
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12961
DebugLoc dl = MI.getDebugLoc();
12962
F->insert(It, copy0MBB);
12963
F->insert(It, sinkMBB);
12964
12965
// Set the call frame size on entry to the new basic blocks.
12966
// See https://reviews.llvm.org/D156113.
12967
unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12968
copy0MBB->setCallFrameSize(CallFrameSize);
12969
sinkMBB->setCallFrameSize(CallFrameSize);
12970
12971
// Transfer the remainder of BB and its successor edges to sinkMBB.
12972
sinkMBB->splice(sinkMBB->begin(), BB,
12973
std::next(MachineBasicBlock::iterator(MI)), BB->end());
12974
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
12975
12976
// Next, add the true and fallthrough blocks as its successors.
12977
BB->addSuccessor(copy0MBB);
12978
BB->addSuccessor(sinkMBB);
12979
12980
if (IsSelect(MI)) {
12981
BuildMI(BB, dl, TII->get(PPC::BC))
12982
.addReg(MI.getOperand(1).getReg())
12983
.addMBB(sinkMBB);
12984
} else {
12985
unsigned SelectPred = MI.getOperand(4).getImm();
12986
BuildMI(BB, dl, TII->get(PPC::BCC))
12987
.addImm(SelectPred)
12988
.addReg(MI.getOperand(1).getReg())
12989
.addMBB(sinkMBB);
12990
}
12991
12992
// copy0MBB:
12993
// %FalseValue = ...
12994
// # fallthrough to sinkMBB
12995
BB = copy0MBB;
12996
12997
// Update machine-CFG edges
12998
BB->addSuccessor(sinkMBB);
12999
13000
// sinkMBB:
13001
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
13002
// ...
13003
BB = sinkMBB;
13004
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
13005
.addReg(MI.getOperand(3).getReg())
13006
.addMBB(copy0MBB)
13007
.addReg(MI.getOperand(2).getReg())
13008
.addMBB(thisMBB);
13009
} else if (MI.getOpcode() == PPC::ReadTB) {
13010
// To read the 64-bit time-base register on a 32-bit target, we read the
13011
// two halves. Should the counter have wrapped while it was being read, we
13012
// need to try again.
13013
// ...
13014
// readLoop:
13015
// mfspr Rx,TBU # load from TBU
13016
// mfspr Ry,TB # load from TB
13017
// mfspr Rz,TBU # load from TBU
13018
// cmpw crX,Rx,Rz # check if 'old'='new'
13019
// bne readLoop # branch if they're not equal
13020
// ...
13021
13022
MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
13023
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
13024
DebugLoc dl = MI.getDebugLoc();
13025
F->insert(It, readMBB);
13026
F->insert(It, sinkMBB);
13027
13028
// Transfer the remainder of BB and its successor edges to sinkMBB.
13029
sinkMBB->splice(sinkMBB->begin(), BB,
13030
std::next(MachineBasicBlock::iterator(MI)), BB->end());
13031
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
13032
13033
BB->addSuccessor(readMBB);
13034
BB = readMBB;
13035
13036
MachineRegisterInfo &RegInfo = F->getRegInfo();
13037
Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
13038
Register LoReg = MI.getOperand(0).getReg();
13039
Register HiReg = MI.getOperand(1).getReg();
13040
13041
BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
13042
BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
13043
BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);
13044
13045
Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
13046
13047
BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
13048
.addReg(HiReg)
13049
.addReg(ReadAgainReg);
13050
BuildMI(BB, dl, TII->get(PPC::BCC))
13051
.addImm(PPC::PRED_NE)
13052
.addReg(CmpReg)
13053
.addMBB(readMBB);
13054
13055
BB->addSuccessor(readMBB);
13056
BB->addSuccessor(sinkMBB);
13057
} else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
13058
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
13059
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
13060
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
13061
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
13062
BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
13063
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
13064
BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);
13065
13066
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
13067
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
13068
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
13069
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
13070
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
13071
BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
13072
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
13073
BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);
13074
13075
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
13076
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
13077
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
13078
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
13079
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
13080
BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
13081
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
13082
BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);
13083
13084
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
13085
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
13086
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
13087
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
13088
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
13089
BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
13090
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
13091
BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);
13092
13093
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
13094
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
13095
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
13096
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
13097
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
13098
BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
13099
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
13100
BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);
13101
13102
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
13103
BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
13104
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
13105
BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
13106
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
13107
BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
13108
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
13109
BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);
13110
13111
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
13112
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LT);
13113
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
13114
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LT);
13115
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
13116
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LT);
13117
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
13118
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LT);
13119
13120
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
13121
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GT);
13122
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
13123
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GT);
13124
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
13125
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GT);
13126
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
13127
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GT);
13128
13129
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
13130
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LT);
13131
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
13132
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LT);
13133
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
13134
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LT);
13135
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
13136
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LT);
13137
13138
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
13139
BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GT);
13140
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
13141
BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GT);
13142
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
13143
BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GT);
13144
else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
13145
BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GT);
13146
13147
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
13148
BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
13149
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
13150
BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
13151
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
13152
BB = EmitAtomicBinary(MI, BB, 4, 0);
13153
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
13154
BB = EmitAtomicBinary(MI, BB, 8, 0);
13155
else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
13156
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
13157
(Subtarget.hasPartwordAtomics() &&
13158
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
13159
(Subtarget.hasPartwordAtomics() &&
13160
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
13161
bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
13162
13163
auto LoadMnemonic = PPC::LDARX;
13164
auto StoreMnemonic = PPC::STDCX;
13165
switch (MI.getOpcode()) {
13166
default:
13167
llvm_unreachable("Compare and swap of unknown size");
13168
case PPC::ATOMIC_CMP_SWAP_I8:
13169
LoadMnemonic = PPC::LBARX;
13170
StoreMnemonic = PPC::STBCX;
13171
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
13172
break;
13173
case PPC::ATOMIC_CMP_SWAP_I16:
13174
LoadMnemonic = PPC::LHARX;
13175
StoreMnemonic = PPC::STHCX;
13176
assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
13177
break;
13178
case PPC::ATOMIC_CMP_SWAP_I32:
13179
LoadMnemonic = PPC::LWARX;
13180
StoreMnemonic = PPC::STWCX;
13181
break;
13182
case PPC::ATOMIC_CMP_SWAP_I64:
13183
LoadMnemonic = PPC::LDARX;
13184
StoreMnemonic = PPC::STDCX;
13185
break;
13186
}
13187
MachineRegisterInfo &RegInfo = F->getRegInfo();
13188
Register dest = MI.getOperand(0).getReg();
13189
Register ptrA = MI.getOperand(1).getReg();
13190
Register ptrB = MI.getOperand(2).getReg();
13191
Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
13192
Register oldval = MI.getOperand(3).getReg();
13193
Register newval = MI.getOperand(4).getReg();
13194
DebugLoc dl = MI.getDebugLoc();
13195
13196
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
13197
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
13198
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
13199
F->insert(It, loop1MBB);
13200
F->insert(It, loop2MBB);
13201
F->insert(It, exitMBB);
13202
exitMBB->splice(exitMBB->begin(), BB,
13203
std::next(MachineBasicBlock::iterator(MI)), BB->end());
13204
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
13205
13206
// thisMBB:
13207
// ...
13208
// fallthrough --> loopMBB
13209
BB->addSuccessor(loop1MBB);
13210
13211
// loop1MBB:
13212
// l[bhwd]arx dest, ptr
13213
// cmp[wd] dest, oldval
13214
// bne- exitBB
13215
// loop2MBB:
13216
// st[bhwd]cx. newval, ptr
13217
// bne- loopMBB
13218
// b exitBB
13219
// exitBB:
13220
BB = loop1MBB;
13221
BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);
13222
BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), CrReg)
13223
.addReg(dest)
13224
.addReg(oldval);
13225
BuildMI(BB, dl, TII->get(PPC::BCC))
13226
.addImm(PPC::PRED_NE)
13227
.addReg(CrReg)
13228
.addMBB(exitMBB);
13229
BB->addSuccessor(loop2MBB);
13230
BB->addSuccessor(exitMBB);
13231
13232
BB = loop2MBB;
13233
BuildMI(BB, dl, TII->get(StoreMnemonic))
13234
.addReg(newval)
13235
.addReg(ptrA)
13236
.addReg(ptrB);
13237
BuildMI(BB, dl, TII->get(PPC::BCC))
13238
.addImm(PPC::PRED_NE)
13239
.addReg(PPC::CR0)
13240
.addMBB(loop1MBB);
13241
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
13242
BB->addSuccessor(loop1MBB);
13243
BB->addSuccessor(exitMBB);
13244
13245
// exitMBB:
13246
// ...
13247
BB = exitMBB;
13248
} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
13249
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
13250
// We must use 64-bit registers for addresses when targeting 64-bit,
13251
// since we're actually doing arithmetic on them. Other registers
13252
// can be 32-bit.
13253
bool is64bit = Subtarget.isPPC64();
13254
bool isLittleEndian = Subtarget.isLittleEndian();
13255
bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
13256
13257
Register dest = MI.getOperand(0).getReg();
13258
Register ptrA = MI.getOperand(1).getReg();
13259
Register ptrB = MI.getOperand(2).getReg();
13260
Register oldval = MI.getOperand(3).getReg();
13261
Register newval = MI.getOperand(4).getReg();
13262
DebugLoc dl = MI.getDebugLoc();
13263
13264
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
13265
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
13266
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
13267
F->insert(It, loop1MBB);
13268
F->insert(It, loop2MBB);
13269
F->insert(It, exitMBB);
13270
exitMBB->splice(exitMBB->begin(), BB,
13271
std::next(MachineBasicBlock::iterator(MI)), BB->end());
13272
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
13273
13274
MachineRegisterInfo &RegInfo = F->getRegInfo();
13275
const TargetRegisterClass *RC =
13276
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
13277
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
13278
13279
Register PtrReg = RegInfo.createVirtualRegister(RC);
13280
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
13281
Register ShiftReg =
13282
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
13283
Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
13284
Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
13285
Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
13286
Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
13287
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
13288
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
13289
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
13290
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
13291
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
13292
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
13293
Register Ptr1Reg;
13294
Register TmpReg = RegInfo.createVirtualRegister(GPRC);
13295
Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
13296
Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
13297
// thisMBB:
13298
// ...
13299
// fallthrough --> loopMBB
13300
BB->addSuccessor(loop1MBB);
13301
13302
// The 4-byte load must be aligned, while a char or short may be
13303
// anywhere in the word. Hence all this nasty bookkeeping code.
13304
// add ptr1, ptrA, ptrB [copy if ptrA==0]
13305
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
13306
// xori shift, shift1, 24 [16]
13307
// rlwinm ptr, ptr1, 0, 0, 29
13308
// slw newval2, newval, shift
13309
// slw oldval2, oldval,shift
13310
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
13311
// slw mask, mask2, shift
13312
// and newval3, newval2, mask
13313
// and oldval3, oldval2, mask
13314
// loop1MBB:
13315
// lwarx tmpDest, ptr
13316
// and tmp, tmpDest, mask
13317
// cmpw tmp, oldval3
13318
// bne- exitBB
13319
// loop2MBB:
13320
// andc tmp2, tmpDest, mask
13321
// or tmp4, tmp2, newval3
13322
// stwcx. tmp4, ptr
13323
// bne- loop1MBB
13324
// b exitBB
13325
// exitBB:
13326
// srw dest, tmpDest, shift
13327
if (ptrA != ZeroReg) {
13328
Ptr1Reg = RegInfo.createVirtualRegister(RC);
13329
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
13330
.addReg(ptrA)
13331
.addReg(ptrB);
13332
} else {
13333
Ptr1Reg = ptrB;
13334
}
13335
13336
// We need use 32-bit subregister to avoid mismatch register class in 64-bit
13337
// mode.
13338
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
13339
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
13340
.addImm(3)
13341
.addImm(27)
13342
.addImm(is8bit ? 28 : 27);
13343
if (!isLittleEndian)
13344
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
13345
.addReg(Shift1Reg)
13346
.addImm(is8bit ? 24 : 16);
13347
if (is64bit)
13348
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
13349
.addReg(Ptr1Reg)
13350
.addImm(0)
13351
.addImm(61);
13352
else
13353
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
13354
.addReg(Ptr1Reg)
13355
.addImm(0)
13356
.addImm(0)
13357
.addImm(29);
13358
BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
13359
.addReg(newval)
13360
.addReg(ShiftReg);
13361
BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
13362
.addReg(oldval)
13363
.addReg(ShiftReg);
13364
if (is8bit)
13365
BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
13366
else {
13367
BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
13368
BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
13369
.addReg(Mask3Reg)
13370
.addImm(65535);
13371
}
13372
BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
13373
.addReg(Mask2Reg)
13374
.addReg(ShiftReg);
13375
BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
13376
.addReg(NewVal2Reg)
13377
.addReg(MaskReg);
13378
BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
13379
.addReg(OldVal2Reg)
13380
.addReg(MaskReg);
13381
13382
BB = loop1MBB;
13383
BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
13384
.addReg(ZeroReg)
13385
.addReg(PtrReg);
13386
BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
13387
.addReg(TmpDestReg)
13388
.addReg(MaskReg);
13389
BuildMI(BB, dl, TII->get(PPC::CMPW), CrReg)
13390
.addReg(TmpReg)
13391
.addReg(OldVal3Reg);
13392
BuildMI(BB, dl, TII->get(PPC::BCC))
13393
.addImm(PPC::PRED_NE)
13394
.addReg(CrReg)
13395
.addMBB(exitMBB);
13396
BB->addSuccessor(loop2MBB);
13397
BB->addSuccessor(exitMBB);
13398
13399
BB = loop2MBB;
13400
BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
13401
.addReg(TmpDestReg)
13402
.addReg(MaskReg);
13403
BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
13404
.addReg(Tmp2Reg)
13405
.addReg(NewVal3Reg);
13406
BuildMI(BB, dl, TII->get(PPC::STWCX))
13407
.addReg(Tmp4Reg)
13408
.addReg(ZeroReg)
13409
.addReg(PtrReg);
13410
BuildMI(BB, dl, TII->get(PPC::BCC))
13411
.addImm(PPC::PRED_NE)
13412
.addReg(PPC::CR0)
13413
.addMBB(loop1MBB);
13414
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
13415
BB->addSuccessor(loop1MBB);
13416
BB->addSuccessor(exitMBB);
13417
13418
// exitMBB:
13419
// ...
13420
BB = exitMBB;
13421
BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
13422
.addReg(TmpReg)
13423
.addReg(ShiftReg);
13424
} else if (MI.getOpcode() == PPC::FADDrtz) {
13425
// This pseudo performs an FADD with rounding mode temporarily forced
13426
// to round-to-zero. We emit this via custom inserter since the FPSCR
13427
// is not modeled at the SelectionDAG level.
13428
Register Dest = MI.getOperand(0).getReg();
13429
Register Src1 = MI.getOperand(1).getReg();
13430
Register Src2 = MI.getOperand(2).getReg();
13431
DebugLoc dl = MI.getDebugLoc();
13432
13433
MachineRegisterInfo &RegInfo = F->getRegInfo();
13434
Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
13435
13436
// Save FPSCR value.
13437
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
13438
13439
// Set rounding mode to round-to-zero.
13440
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1))
13441
.addImm(31)
13442
.addReg(PPC::RM, RegState::ImplicitDefine);
13443
13444
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0))
13445
.addImm(30)
13446
.addReg(PPC::RM, RegState::ImplicitDefine);
13447
13448
// Perform addition.
13449
auto MIB = BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest)
13450
.addReg(Src1)
13451
.addReg(Src2);
13452
if (MI.getFlag(MachineInstr::NoFPExcept))
13453
MIB.setMIFlag(MachineInstr::NoFPExcept);
13454
13455
// Restore FPSCR value.
13456
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
13457
} else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
13458
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
13459
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
13460
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
13461
unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
13462
MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
13463
? PPC::ANDI8_rec
13464
: PPC::ANDI_rec;
13465
bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
13466
MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);
13467
13468
MachineRegisterInfo &RegInfo = F->getRegInfo();
13469
Register Dest = RegInfo.createVirtualRegister(
13470
Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);
13471
13472
DebugLoc Dl = MI.getDebugLoc();
13473
BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
13474
.addReg(MI.getOperand(1).getReg())
13475
.addImm(1);
13476
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
13477
MI.getOperand(0).getReg())
13478
.addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
13479
} else if (MI.getOpcode() == PPC::TCHECK_RET) {
13480
DebugLoc Dl = MI.getDebugLoc();
13481
MachineRegisterInfo &RegInfo = F->getRegInfo();
13482
Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
13483
BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
13484
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
13485
MI.getOperand(0).getReg())
13486
.addReg(CRReg);
13487
} else if (MI.getOpcode() == PPC::TBEGIN_RET) {
13488
DebugLoc Dl = MI.getDebugLoc();
13489
unsigned Imm = MI.getOperand(1).getImm();
13490
BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
13491
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
13492
MI.getOperand(0).getReg())
13493
.addReg(PPC::CR0EQ);
13494
} else if (MI.getOpcode() == PPC::SETRNDi) {
13495
DebugLoc dl = MI.getDebugLoc();
13496
Register OldFPSCRReg = MI.getOperand(0).getReg();
13497
13498
// Save FPSCR value.
13499
if (MRI.use_empty(OldFPSCRReg))
13500
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg);
13501
else
13502
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
13503
13504
// The floating point rounding mode is in the bits 62:63 of FPSCR, and has
13505
// the following settings:
13506
// 00 Round to nearest
13507
// 01 Round to 0
13508
// 10 Round to +inf
13509
// 11 Round to -inf
13510
13511
// When the operand is immediate, using the two least significant bits of
13512
// the immediate to set the bits 62:63 of FPSCR.
13513
unsigned Mode = MI.getOperand(1).getImm();
13514
BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
13515
.addImm(31)
13516
.addReg(PPC::RM, RegState::ImplicitDefine);
13517
13518
BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
13519
.addImm(30)
13520
.addReg(PPC::RM, RegState::ImplicitDefine);
13521
} else if (MI.getOpcode() == PPC::SETRND) {
13522
DebugLoc dl = MI.getDebugLoc();
13523
13524
// Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
13525
// or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
13526
// If the target doesn't have DirectMove, we should use stack to do the
13527
// conversion, because the target doesn't have the instructions like mtvsrd
13528
// or mfvsrd to do this conversion directly.
13529
auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {
13530
if (Subtarget.hasDirectMove()) {
13531
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
13532
.addReg(SrcReg);
13533
} else {
13534
// Use stack to do the register copy.
13535
unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
13536
MachineRegisterInfo &RegInfo = F->getRegInfo();
13537
const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
13538
if (RC == &PPC::F8RCRegClass) {
13539
// Copy register from F8RCRegClass to G8RCRegclass.
13540
assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
13541
"Unsupported RegClass.");
13542
13543
StoreOp = PPC::STFD;
13544
LoadOp = PPC::LD;
13545
} else {
13546
// Copy register from G8RCRegClass to F8RCRegclass.
13547
assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
13548
(RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
13549
"Unsupported RegClass.");
13550
}
13551
13552
MachineFrameInfo &MFI = F->getFrameInfo();
13553
int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
13554
13555
MachineMemOperand *MMOStore = F->getMachineMemOperand(
13556
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
13557
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
13558
MFI.getObjectAlign(FrameIdx));
13559
13560
// Store the SrcReg into the stack.
13561
BuildMI(*BB, MI, dl, TII->get(StoreOp))
13562
.addReg(SrcReg)
13563
.addImm(0)
13564
.addFrameIndex(FrameIdx)
13565
.addMemOperand(MMOStore);
13566
13567
MachineMemOperand *MMOLoad = F->getMachineMemOperand(
13568
MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
13569
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
13570
MFI.getObjectAlign(FrameIdx));
13571
13572
// Load from the stack where SrcReg is stored, and save to DestReg,
13573
// so we have done the RegClass conversion from RegClass::SrcReg to
13574
// RegClass::DestReg.
13575
BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
13576
.addImm(0)
13577
.addFrameIndex(FrameIdx)
13578
.addMemOperand(MMOLoad);
13579
}
13580
};
13581
13582
Register OldFPSCRReg = MI.getOperand(0).getReg();
13583
13584
// Save FPSCR value.
13585
BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);
13586
13587
// When the operand is gprc register, use two least significant bits of the
13588
// register and mtfsf instruction to set the bits 62:63 of FPSCR.
13589
//
13590
// copy OldFPSCRTmpReg, OldFPSCRReg
13591
// (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
13592
// rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
13593
// copy NewFPSCRReg, NewFPSCRTmpReg
13594
// mtfsf 255, NewFPSCRReg
13595
MachineOperand SrcOp = MI.getOperand(1);
13596
MachineRegisterInfo &RegInfo = F->getRegInfo();
13597
Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
13598
13599
copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);
13600
13601
Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
13602
Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
13603
13604
// The first operand of INSERT_SUBREG should be a register which has
13605
// subregisters, we only care about its RegClass, so we should use an
13606
// IMPLICIT_DEF register.
13607
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
13608
BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
13609
.addReg(ImDefReg)
13610
.add(SrcOp)
13611
.addImm(1);
13612
13613
Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
13614
BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
13615
.addReg(OldFPSCRTmpReg)
13616
.addReg(ExtSrcReg)
13617
.addImm(0)
13618
.addImm(62);
13619
13620
Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
13621
copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);
13622
13623
// The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:63
13624
// bits of FPSCR.
13625
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
13626
.addImm(255)
13627
.addReg(NewFPSCRReg)
13628
.addImm(0)
13629
.addImm(0);
13630
} else if (MI.getOpcode() == PPC::SETFLM) {
13631
DebugLoc Dl = MI.getDebugLoc();
13632
13633
// Result of setflm is previous FPSCR content, so we need to save it first.
13634
Register OldFPSCRReg = MI.getOperand(0).getReg();
13635
if (MRI.use_empty(OldFPSCRReg))
13636
BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg);
13637
else
13638
BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg);
13639
13640
// Put bits in 32:63 to FPSCR.
13641
Register NewFPSCRReg = MI.getOperand(1).getReg();
13642
BuildMI(*BB, MI, Dl, TII->get(PPC::MTFSF))
13643
.addImm(255)
13644
.addReg(NewFPSCRReg)
13645
.addImm(0)
13646
.addImm(0);
13647
} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
13648
MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
13649
return emitProbedAlloca(MI, BB);
13650
} else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
13651
DebugLoc DL = MI.getDebugLoc();
13652
Register Src = MI.getOperand(2).getReg();
13653
Register Lo = MI.getOperand(0).getReg();
13654
Register Hi = MI.getOperand(1).getReg();
13655
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
13656
.addDef(Lo)
13657
.addUse(Src, 0, PPC::sub_gp8_x1);
13658
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
13659
.addDef(Hi)
13660
.addUse(Src, 0, PPC::sub_gp8_x0);
13661
} else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
13662
MI.getOpcode() == PPC::STQX_PSEUDO) {
13663
DebugLoc DL = MI.getDebugLoc();
13664
// Ptr is used as the ptr_rc_no_r0 part
13665
// of LQ/STQ's memory operand and adding result of RA and RB,
13666
// so it has to be g8rc_and_g8rc_nox0.
13667
Register Ptr =
13668
F->getRegInfo().createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
13669
Register Val = MI.getOperand(0).getReg();
13670
Register RA = MI.getOperand(1).getReg();
13671
Register RB = MI.getOperand(2).getReg();
13672
BuildMI(*BB, MI, DL, TII->get(PPC::ADD8), Ptr).addReg(RA).addReg(RB);
13673
BuildMI(*BB, MI, DL,
13674
MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(PPC::LQ)
13675
: TII->get(PPC::STQ))
13676
.addReg(Val, MI.getOpcode() == PPC::LQX_PSEUDO ? RegState::Define : 0)
13677
.addImm(0)
13678
.addReg(Ptr);
13679
} else {
13680
llvm_unreachable("Unexpected instr type to insert");
13681
}
13682
13683
MI.eraseFromParent(); // The pseudo instruction is gone now.
13684
return BB;
13685
}
13686
13687
//===----------------------------------------------------------------------===//
13688
// Target Optimization Hooks
13689
//===----------------------------------------------------------------------===//
13690
13691
// Pick the default number of refinement iterations applied to a hardware
// reciprocal / reciprocal-square-root estimate of type VT.
//
// Rationale (as recorded for this heuristic): the refinement converges
// quadratically, so every iteration roughly doubles the number of correct
// digits. For both FRE and FRSQRTE the minimum architected relative accuracy
// is 2^-5; when hasRecipPrec() holds it is 2^-14. IEEE float has 23 digits and
// double has 52 digits, hence one extra iteration for f64 element types.
static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
  // Base iteration count depends only on the precision of the estimate.
  const int BaseSteps = Subtarget.hasRecipPrec() ? 1 : 3;

  // Double-precision elements (scalar or vector) need one more iteration.
  const bool IsDoublePrecision = VT.getScalarType() == MVT::f64;
  return IsDoublePrecision ? BaseSteps + 1 : BaseSteps;
}
13701
13702
SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
                                            const DenormalMode &Mode) const {
  // The FTSQRT-based test below is only used when i1 is a legal type and the
  // operand is either f64, or a v2f64/v4f32 vector on a VSX subtarget.
  // Everything else is delegated to the generic implementation.
  const EVT VT = Op.getValueType();
  if (!isTypeLegal(MVT::i1))
    return TargetLowering::getSqrtInputTest(Op, DAG, Mode);

  const bool IsVSXVectorTy =
      (VT == MVT::v2f64 || VT == MVT::v4f32) && Subtarget.hasVSX();
  if (VT != MVT::f64 && !IsVSXVectorTy)
    return TargetLowering::getSqrtInputTest(Op, DAG, Mode);

  SDLoc DL(Op);

  // FTSQRT writes its result into a CR field (modelled here as an i32 value).
  //
  //   ftsqrt BF,FRB
  //   Let e_b be the unbiased exponent of the double-precision floating-point
  //   operand in register FRB. fe_flag is set to 1 if either of the following
  //   conditions occurs:
  //     - the operand in FRB is a zero, a NaN, an infinity, or a negative
  //       value;
  //     - e_b is less than or equal to -970.
  //   Otherwise fe_flag is set to 0.
  //
  // Both the VSX and non-VSX forms set the EQ bit of the CR field when the
  // input is not eligible for iteration (zero/negative/infinity/NaN, or an
  // unbiased exponent below -970).
  SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op);

  // Pull the EQ bit out of the CR field as the i1 test result.
  SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32);
  MachineSDNode *EQBit = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
                                            MVT::i1, FTSQRT, SRIdxVal);
  return SDValue(EQBit, 0);
}
13730
13731
SDValue
PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
                                               SelectionDAG &DAG) const {
  // A PPCISD::FSQRT node is emitted only for f64, or for v2f64/v4f32 when the
  // subtarget has VSX (vector square root is VSX-only). Any other type is
  // handed to the generic implementation.
  const EVT VT = Op.getValueType();
  const bool IsVSXVectorTy =
      (VT == MVT::v2f64 || VT == MVT::v4f32) && Subtarget.hasVSX();

  if (VT == MVT::f64 || IsVSXVectorTy)
    return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op);

  return TargetLowering::getSqrtResultForDenormInput(Op, DAG);
}
13742
13743
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                           int Enabled, int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  const EVT VT = Operand.getValueType();

  // An estimate node is produced only when the subtarget feature matching the
  // operand type is available.
  const bool CanEstimate = (VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
                           (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
                           (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
                           (VT == MVT::v2f64 && Subtarget.hasVSX());

  // No estimate available: return an empty SDValue and leave the in/out
  // parameters untouched.
  if (!CanEstimate)
    return SDValue();

  // Fill in the default iteration count if the caller left it unspecified.
  if (RefinementSteps == ReciprocalEstimate::Unspecified)
    RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

  // The Newton-Raphson computation with a single constant does not provide
  // enough accuracy on some CPUs.
  UseOneConstNR = !Subtarget.needsTwoConstNR();

  return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
}
13762
13763
SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  const EVT VT = Operand.getValueType();

  // An estimate node is produced only when the subtarget feature matching the
  // operand type is available.
  const bool CanEstimate = (VT == MVT::f32 && Subtarget.hasFRES()) ||
                           (VT == MVT::f64 && Subtarget.hasFRE()) ||
                           (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
                           (VT == MVT::v2f64 && Subtarget.hasVSX());

  // No estimate available: return an empty SDValue and leave RefinementSteps
  // untouched.
  if (!CanEstimate)
    return SDValue();

  // Fill in the default iteration count if the caller left it unspecified.
  if (RefinementSteps == ReciprocalEstimate::Unspecified)
    RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

  return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
}
13777
13778
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
13779
// Note: This functionality is used only when unsafe-fp-math is enabled, and
13780
// on cores with reciprocal estimates (which are used when unsafe-fp-math is
13781
// enabled for division), this functionality is redundant with the default
13782
// combiner logic (once the division -> reciprocal/multiply transformation
13783
// has taken place). As a result, this matters more for older cores than for
13784
// newer ones.
13785
13786
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
13787
// reciprocal if there are two or more FDIVs (for embedded cores with only
13788
// one FP pipeline) for three or more FDIVs (for generic OOO cores).
13789
switch (Subtarget.getCPUDirective()) {
13790
default:
13791
return 3;
13792
case PPC::DIR_440:
13793
case PPC::DIR_A2:
13794
case PPC::DIR_E500:
13795
case PPC::DIR_E500mc:
13796
case PPC::DIR_E5500:
13797
return 2;
13798
}
13799
}
13800
13801
// isConsecutiveLSLoc needs to work even if all adds have not yet been
13802
// collapsed, and so we need to look through chains of them.
13803
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
13804
int64_t& Offset, SelectionDAG &DAG) {
13805
if (DAG.isBaseWithConstantOffset(Loc)) {
13806
Base = Loc.getOperand(0);
13807
Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
13808
13809
// The base might itself be a base plus an offset, and if so, accumulate
13810
// that as well.
13811
getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
13812
}
13813
}
13814
13815
static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
13816
unsigned Bytes, int Dist,
13817
SelectionDAG &DAG) {
13818
if (VT.getSizeInBits() / 8 != Bytes)
13819
return false;
13820
13821
SDValue BaseLoc = Base->getBasePtr();
13822
if (Loc.getOpcode() == ISD::FrameIndex) {
13823
if (BaseLoc.getOpcode() != ISD::FrameIndex)
13824
return false;
13825
const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
13826
int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
13827
int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
13828
int FS = MFI.getObjectSize(FI);
13829
int BFS = MFI.getObjectSize(BFI);
13830
if (FS != BFS || FS != (int)Bytes) return false;
13831
return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
13832
}
13833
13834
SDValue Base1 = Loc, Base2 = BaseLoc;
13835
int64_t Offset1 = 0, Offset2 = 0;
13836
getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
13837
getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
13838
if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
13839
return true;
13840
13841
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13842
const GlobalValue *GV1 = nullptr;
13843
const GlobalValue *GV2 = nullptr;
13844
Offset1 = 0;
13845
Offset2 = 0;
13846
bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
13847
bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
13848
if (isGA1 && isGA2 && GV1 == GV2)
13849
return Offset1 == (Offset2 + Dist*Bytes);
13850
return false;
13851
}
13852
13853
// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
13854
// not enforce equality of the chain operands.
13855
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
13856
unsigned Bytes, int Dist,
13857
SelectionDAG &DAG) {
13858
if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
13859
EVT VT = LS->getMemoryVT();
13860
SDValue Loc = LS->getBasePtr();
13861
return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
13862
}
13863
13864
if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
13865
EVT VT;
13866
switch (N->getConstantOperandVal(1)) {
13867
default: return false;
13868
case Intrinsic::ppc_altivec_lvx:
13869
case Intrinsic::ppc_altivec_lvxl:
13870
case Intrinsic::ppc_vsx_lxvw4x:
13871
case Intrinsic::ppc_vsx_lxvw4x_be:
13872
VT = MVT::v4i32;
13873
break;
13874
case Intrinsic::ppc_vsx_lxvd2x:
13875
case Intrinsic::ppc_vsx_lxvd2x_be:
13876
VT = MVT::v2f64;
13877
break;
13878
case Intrinsic::ppc_altivec_lvebx:
13879
VT = MVT::i8;
13880
break;
13881
case Intrinsic::ppc_altivec_lvehx:
13882
VT = MVT::i16;
13883
break;
13884
case Intrinsic::ppc_altivec_lvewx:
13885
VT = MVT::i32;
13886
break;
13887
}
13888
13889
return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
13890
}
13891
13892
if (N->getOpcode() == ISD::INTRINSIC_VOID) {
13893
EVT VT;
13894
switch (N->getConstantOperandVal(1)) {
13895
default: return false;
13896
case Intrinsic::ppc_altivec_stvx:
13897
case Intrinsic::ppc_altivec_stvxl:
13898
case Intrinsic::ppc_vsx_stxvw4x:
13899
VT = MVT::v4i32;
13900
break;
13901
case Intrinsic::ppc_vsx_stxvd2x:
13902
VT = MVT::v2f64;
13903
break;
13904
case Intrinsic::ppc_vsx_stxvw4x_be:
13905
VT = MVT::v4i32;
13906
break;
13907
case Intrinsic::ppc_vsx_stxvd2x_be:
13908
VT = MVT::v2f64;
13909
break;
13910
case Intrinsic::ppc_altivec_stvebx:
13911
VT = MVT::i8;
13912
break;
13913
case Intrinsic::ppc_altivec_stvehx:
13914
VT = MVT::i16;
13915
break;
13916
case Intrinsic::ppc_altivec_stvewx:
13917
VT = MVT::i32;
13918
break;
13919
}
13920
13921
return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
13922
}
13923
13924
return false;
13925
}
13926
13927
// Return true is there is a nearyby consecutive load to the one provided
13928
// (regardless of alignment). We search up and down the chain, looking though
13929
// token factors and other loads (but nothing else). As a result, a true result
13930
// indicates that it is safe to create a new consecutive load adjacent to the
13931
// load provided.
13932
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
13933
SDValue Chain = LD->getChain();
13934
EVT VT = LD->getMemoryVT();
13935
13936
SmallSet<SDNode *, 16> LoadRoots;
13937
SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
13938
SmallSet<SDNode *, 16> Visited;
13939
13940
// First, search up the chain, branching to follow all token-factor operands.
13941
// If we find a consecutive load, then we're done, otherwise, record all
13942
// nodes just above the top-level loads and token factors.
13943
while (!Queue.empty()) {
13944
SDNode *ChainNext = Queue.pop_back_val();
13945
if (!Visited.insert(ChainNext).second)
13946
continue;
13947
13948
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
13949
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
13950
return true;
13951
13952
if (!Visited.count(ChainLD->getChain().getNode()))
13953
Queue.push_back(ChainLD->getChain().getNode());
13954
} else if (ChainNext->getOpcode() == ISD::TokenFactor) {
13955
for (const SDUse &O : ChainNext->ops())
13956
if (!Visited.count(O.getNode()))
13957
Queue.push_back(O.getNode());
13958
} else
13959
LoadRoots.insert(ChainNext);
13960
}
13961
13962
// Second, search down the chain, starting from the top-level nodes recorded
13963
// in the first phase. These top-level nodes are the nodes just above all
13964
// loads and token factors. Starting with their uses, recursively look though
13965
// all loads (just the chain uses) and token factors to find a consecutive
13966
// load.
13967
Visited.clear();
13968
Queue.clear();
13969
13970
for (SDNode *I : LoadRoots) {
13971
Queue.push_back(I);
13972
13973
while (!Queue.empty()) {
13974
SDNode *LoadRoot = Queue.pop_back_val();
13975
if (!Visited.insert(LoadRoot).second)
13976
continue;
13977
13978
if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
13979
if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
13980
return true;
13981
13982
for (SDNode *U : LoadRoot->uses())
13983
if (((isa<MemSDNode>(U) &&
13984
cast<MemSDNode>(U)->getChain().getNode() == LoadRoot) ||
13985
U->getOpcode() == ISD::TokenFactor) &&
13986
!Visited.count(U))
13987
Queue.push_back(U);
13988
}
13989
}
13990
13991
return false;
13992
}
13993
13994
/// This function is called when we have proved that a SETCC node can be replaced
13995
/// by subtraction (and other supporting instructions) so that the result of
13996
/// comparison is kept in a GPR instead of CR. This function is purely for
13997
/// codegen purposes and has some flags to guide the codegen process.
13998
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
13999
bool Swap, SDLoc &DL, SelectionDAG &DAG) {
14000
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
14001
14002
// Zero extend the operands to the largest legal integer. Originally, they
14003
// must be of a strictly smaller size.
14004
auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
14005
DAG.getConstant(Size, DL, MVT::i32));
14006
auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
14007
DAG.getConstant(Size, DL, MVT::i32));
14008
14009
// Swap if needed. Depends on the condition code.
14010
if (Swap)
14011
std::swap(Op0, Op1);
14012
14013
// Subtract extended integers.
14014
auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);
14015
14016
// Move the sign bit to the least significant position and zero out the rest.
14017
// Now the least significant bit carries the result of original comparison.
14018
auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
14019
DAG.getConstant(Size - 1, DL, MVT::i32));
14020
auto Final = Shifted;
14021
14022
// Complement the result if needed. Based on the condition code.
14023
if (Complement)
14024
Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
14025
DAG.getConstant(1, DL, MVT::i64));
14026
14027
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
14028
}
14029
14030
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // The size of the integers being compared has a critical role in the
  // following analysis, so only do this once all types are legal.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // The SETCC is replaced with a subtraction only if every user of it is a
  // zero-extension.
  const bool HasNonZExtUser =
      llvm::any_of(N->uses(), [](const SDNode *User) {
        return User->getOpcode() != ISD::ZERO_EXTEND;
      });
  if (HasNonZExtUser)
    return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  auto OpSize = N->getOperand(0).getValueSizeInBits();

  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();

  // The operands must be strictly narrower than the largest legal integer.
  if (!(OpSize < Size))
    return SDValue();

  // Select how generateEquivalentSub must arrange the subtraction for each
  // unsigned ordering predicate; every other predicate is left alone.
  bool Complement = false;
  bool Swap = false;
  switch (CC) {
  case ISD::SETULT:
    break;
  case ISD::SETULE:
    Complement = true;
    Swap = true;
    break;
  case ISD::SETUGT:
    Swap = true;
    break;
  case ISD::SETUGE:
    Complement = true;
    break;
  default:
    return SDValue();
  }

  return generateEquivalentSub(N, Size, Complement, Swap, DL, DAG);
}
14069
14070
SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
14071
DAGCombinerInfo &DCI) const {
14072
SelectionDAG &DAG = DCI.DAG;
14073
SDLoc dl(N);
14074
14075
assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
14076
// If we're tracking CR bits, we need to be careful that we don't have:
14077
// trunc(binary-ops(zext(x), zext(y)))
14078
// or
14079
// trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
14080
// such that we're unnecessarily moving things into GPRs when it would be
14081
// better to keep them in CR bits.
14082
14083
// Note that trunc here can be an actual i1 trunc, or can be the effective
14084
// truncation that comes from a setcc or select_cc.
14085
if (N->getOpcode() == ISD::TRUNCATE &&
14086
N->getValueType(0) != MVT::i1)
14087
return SDValue();
14088
14089
if (N->getOperand(0).getValueType() != MVT::i32 &&
14090
N->getOperand(0).getValueType() != MVT::i64)
14091
return SDValue();
14092
14093
if (N->getOpcode() == ISD::SETCC ||
14094
N->getOpcode() == ISD::SELECT_CC) {
14095
// If we're looking at a comparison, then we need to make sure that the
14096
// high bits (all except for the first) don't matter the result.
14097
ISD::CondCode CC =
14098
cast<CondCodeSDNode>(N->getOperand(
14099
N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
14100
unsigned OpBits = N->getOperand(0).getValueSizeInBits();
14101
14102
if (ISD::isSignedIntSetCC(CC)) {
14103
if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
14104
DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
14105
return SDValue();
14106
} else if (ISD::isUnsignedIntSetCC(CC)) {
14107
if (!DAG.MaskedValueIsZero(N->getOperand(0),
14108
APInt::getHighBitsSet(OpBits, OpBits-1)) ||
14109
!DAG.MaskedValueIsZero(N->getOperand(1),
14110
APInt::getHighBitsSet(OpBits, OpBits-1)))
14111
return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
14112
: SDValue());
14113
} else {
14114
// This is neither a signed nor an unsigned comparison, just make sure
14115
// that the high bits are equal.
14116
KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));
14117
KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));
14118
14119
// We don't really care about what is known about the first bit (if
14120
// anything), so pretend that it is known zero for both to ensure they can
14121
// be compared as constants.
14122
Op1Known.Zero.setBit(0); Op1Known.One.clearBit(0);
14123
Op2Known.Zero.setBit(0); Op2Known.One.clearBit(0);
14124
14125
if (!Op1Known.isConstant() || !Op2Known.isConstant() ||
14126
Op1Known.getConstant() != Op2Known.getConstant())
14127
return SDValue();
14128
}
14129
}
14130
14131
// We now know that the higher-order bits are irrelevant, we just need to
14132
// make sure that all of the intermediate operations are bit operations, and
14133
// all inputs are extensions.
14134
if (N->getOperand(0).getOpcode() != ISD::AND &&
14135
N->getOperand(0).getOpcode() != ISD::OR &&
14136
N->getOperand(0).getOpcode() != ISD::XOR &&
14137
N->getOperand(0).getOpcode() != ISD::SELECT &&
14138
N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
14139
N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
14140
N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
14141
N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
14142
N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
14143
return SDValue();
14144
14145
if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
14146
N->getOperand(1).getOpcode() != ISD::AND &&
14147
N->getOperand(1).getOpcode() != ISD::OR &&
14148
N->getOperand(1).getOpcode() != ISD::XOR &&
14149
N->getOperand(1).getOpcode() != ISD::SELECT &&
14150
N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
14151
N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
14152
N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
14153
N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
14154
N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
14155
return SDValue();
14156
14157
SmallVector<SDValue, 4> Inputs;
14158
SmallVector<SDValue, 8> BinOps, PromOps;
14159
SmallPtrSet<SDNode *, 16> Visited;
14160
14161
for (unsigned i = 0; i < 2; ++i) {
14162
if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
14163
N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
14164
N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
14165
N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
14166
isa<ConstantSDNode>(N->getOperand(i)))
14167
Inputs.push_back(N->getOperand(i));
14168
else
14169
BinOps.push_back(N->getOperand(i));
14170
14171
if (N->getOpcode() == ISD::TRUNCATE)
14172
break;
14173
}
14174
14175
// Visit all inputs, collect all binary operations (and, or, xor and
14176
// select) that are all fed by extensions.
14177
while (!BinOps.empty()) {
14178
SDValue BinOp = BinOps.pop_back_val();
14179
14180
if (!Visited.insert(BinOp.getNode()).second)
14181
continue;
14182
14183
PromOps.push_back(BinOp);
14184
14185
for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
14186
// The condition of the select is not promoted.
14187
if (BinOp.getOpcode() == ISD::SELECT && i == 0)
14188
continue;
14189
if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
14190
continue;
14191
14192
if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
14193
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
14194
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
14195
BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
14196
isa<ConstantSDNode>(BinOp.getOperand(i))) {
14197
Inputs.push_back(BinOp.getOperand(i));
14198
} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
14199
BinOp.getOperand(i).getOpcode() == ISD::OR ||
14200
BinOp.getOperand(i).getOpcode() == ISD::XOR ||
14201
BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
14202
BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
14203
BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
14204
BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
14205
BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
14206
BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
14207
BinOps.push_back(BinOp.getOperand(i));
14208
} else {
14209
// We have an input that is not an extension or another binary
14210
// operation; we'll abort this transformation.
14211
return SDValue();
14212
}
14213
}
14214
}
14215
14216
// Make sure that this is a self-contained cluster of operations (which
14217
// is not quite the same thing as saying that everything has only one
14218
// use).
14219
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
14220
if (isa<ConstantSDNode>(Inputs[i]))
14221
continue;
14222
14223
for (const SDNode *User : Inputs[i].getNode()->uses()) {
14224
if (User != N && !Visited.count(User))
14225
return SDValue();
14226
14227
// Make sure that we're not going to promote the non-output-value
14228
// operand(s) of SELECT or SELECT_CC.
14229
// FIXME: Although we could sometimes handle this, and it does occur in
14230
// practice that one of the condition inputs to the select is also one of
14231
// the outputs, we currently can't deal with this.
14232
if (User->getOpcode() == ISD::SELECT) {
14233
if (User->getOperand(0) == Inputs[i])
14234
return SDValue();
14235
} else if (User->getOpcode() == ISD::SELECT_CC) {
14236
if (User->getOperand(0) == Inputs[i] ||
14237
User->getOperand(1) == Inputs[i])
14238
return SDValue();
14239
}
14240
}
14241
}
14242
14243
for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
14244
for (const SDNode *User : PromOps[i].getNode()->uses()) {
14245
if (User != N && !Visited.count(User))
14246
return SDValue();
14247
14248
// Make sure that we're not going to promote the non-output-value
14249
// operand(s) of SELECT or SELECT_CC.
14250
// FIXME: Although we could sometimes handle this, and it does occur in
14251
// practice that one of the condition inputs to the select is also one of
14252
// the outputs, we currently can't deal with this.
14253
if (User->getOpcode() == ISD::SELECT) {
14254
if (User->getOperand(0) == PromOps[i])
14255
return SDValue();
14256
} else if (User->getOpcode() == ISD::SELECT_CC) {
14257
if (User->getOperand(0) == PromOps[i] ||
14258
User->getOperand(1) == PromOps[i])
14259
return SDValue();
14260
}
14261
}
14262
}
14263
14264
// Replace all inputs with the extension operand.
14265
for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
14266
// Constants may have users outside the cluster of to-be-promoted nodes,
14267
// and so we need to replace those as we do the promotions.
14268
if (isa<ConstantSDNode>(Inputs[i]))
14269
continue;
14270
else
14271
DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
14272
}
14273
14274
std::list<HandleSDNode> PromOpHandles;
14275
for (auto &PromOp : PromOps)
14276
PromOpHandles.emplace_back(PromOp);
14277
14278
// Replace all operations (these are all the same, but have a different
14279
// (i1) return type). DAG.getNode will validate that the types of
14280
// a binary operator match, so go through the list in reverse so that
14281
// we've likely promoted both operands first. Any intermediate truncations or
14282
// extensions disappear.
14283
while (!PromOpHandles.empty()) {
14284
SDValue PromOp = PromOpHandles.back().getValue();
14285
PromOpHandles.pop_back();
14286
14287
if (PromOp.getOpcode() == ISD::TRUNCATE ||
14288
PromOp.getOpcode() == ISD::SIGN_EXTEND ||
14289
PromOp.getOpcode() == ISD::ZERO_EXTEND ||
14290
PromOp.getOpcode() == ISD::ANY_EXTEND) {
14291
if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
14292
PromOp.getOperand(0).getValueType() != MVT::i1) {
14293
// The operand is not yet ready (see comment below).
14294
PromOpHandles.emplace_front(PromOp);
14295
continue;
14296
}
14297
14298
SDValue RepValue = PromOp.getOperand(0);
14299
if (isa<ConstantSDNode>(RepValue))
14300
RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);
14301
14302
DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
14303
continue;
14304
}
14305
14306
unsigned C;
14307
switch (PromOp.getOpcode()) {
14308
default: C = 0; break;
14309
case ISD::SELECT: C = 1; break;
14310
case ISD::SELECT_CC: C = 2; break;
14311
}
14312
14313
if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
14314
PromOp.getOperand(C).getValueType() != MVT::i1) ||
14315
(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
14316
PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
14317
// The to-be-promoted operands of this node have not yet been
14318
// promoted (this should be rare because we're going through the
14319
// list backward, but if one of the operands has several users in
14320
// this cluster of to-be-promoted nodes, it is possible).
14321
PromOpHandles.emplace_front(PromOp);
14322
continue;
14323
}
14324
14325
SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
14326
PromOp.getNode()->op_end());
14327
14328
// If there are any constant inputs, make sure they're replaced now.
14329
for (unsigned i = 0; i < 2; ++i)
14330
if (isa<ConstantSDNode>(Ops[C+i]))
14331
Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
14332
14333
DAG.ReplaceAllUsesOfValueWith(PromOp,
14334
DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
14335
}
14336
14337
// Now we're left with the initial truncation itself.
14338
if (N->getOpcode() == ISD::TRUNCATE)
14339
return N->getOperand(0);
14340
14341
// Otherwise, this is a comparison. The operands to be compared have just
14342
// changed type (to i1), but everything else is the same.
14343
return SDValue(N, 0);
14344
}
14345
14346
/// Try to fold an extension of a tree of bitwise/select operations whose
/// leaves are truncations (or constants) by re-creating the whole tree
/// directly in the extended type.
///
/// \p N is the extension node (sign, zero or any extend) being combined.
/// Returns an empty SDValue when the pattern does not match; otherwise the
/// value that replaces \p N.
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  // If we're tracking CR bits, we need to be careful that we don't have:
  //   zext(binary-ops(trunc(x), trunc(y)))
  // or
  //   zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
  // such that we're unnecessarily moving things into CR bits that can more
  // efficiently stay in GPRs. Note that if we're not certain that the high
  // bits are set as required by the final extension, we still may need to do
  // some masking to get the proper behavior.

  // This same functionality is important on PPC64 when dealing with
  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
  // the return values of functions. Because it is so similar, it is handled
  // here as well.

  // Neither the result type nor the opcode of N changes while its operands
  // are being rewritten below, so both can be read once up front.
  const EVT ResVT = N->getValueType(0);
  const unsigned ExtOpc = N->getOpcode();

  if (ResVT != MVT::i32 && ResVT != MVT::i64)
    return SDValue();

  const EVT NarrowVT = N->getOperand(0).getValueType();
  const bool FromCRBit = NarrowVT == MVT::i1 && Subtarget.useCRBits();
  const bool FromWordOnPPC64 = NarrowVT == MVT::i32 && Subtarget.isPPC64();
  if (!FromCRBit && !FromWordOnPPC64)
    return SDValue();

  // The operations that are allowed to form the interior of the cluster.
  auto IsPromotableOpcode = [](unsigned Opc) {
    switch (Opc) {
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SELECT:
    case ISD::SELECT_CC:
      return true;
    default:
      return false;
    }
  };

  if (!IsPromotableOpcode(N->getOperand(0).getOpcode()))
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by truncations.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.pop_back_val();

    if (!Visited.insert(BinOp.getNode()).second)
      continue;

    PromOps.push_back(BinOp);

    const unsigned BinOpc = BinOp.getOpcode();
    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOpc == ISD::SELECT && i == 0)
        continue;
      if (BinOpc == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      SDValue Operand = BinOp.getOperand(i);
      if (Operand.getOpcode() == ISD::TRUNCATE ||
          isa<ConstantSDNode>(Operand)) {
        Inputs.push_back(Operand);
      } else if (IsPromotableOpcode(Operand.getOpcode())) {
        BinOps.push_back(Operand);
      } else {
        // We have an input that is not a truncation or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

  // The operands of a select that must be truncated when the select is
  // promoted because the operand is actually part of the to-be-promoted set.
  DenseMap<SDNode *, EVT> SelectTruncOp[2];

  // Walks the users of V. Returns false if any user lies outside the cluster
  // (i.e. is neither N nor a visited node). For users that are SELECT or
  // SELECT_CC nodes consuming V as a non-output-value operand (the condition
  // of a SELECT, or a compared operand of a SELECT_CC), records the operand's
  // current type so it can be truncated back once the user is promoted.
  auto ScanUsers = [&](SDValue V) -> bool {
    for (SDNode *User : V.getNode()->uses()) {
      if (User != N && !Visited.count(User))
        return false;

      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == V)
          SelectTruncOp[0].insert(
              std::make_pair(User, User->getOperand(0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == V)
          SelectTruncOp[0].insert(
              std::make_pair(User, User->getOperand(0).getValueType()));
        if (User->getOperand(1) == V)
          SelectTruncOp[1].insert(
              std::make_pair(User, User->getOperand(1).getValueType()));
      }
    }
    return true;
  };

  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (const SDValue &Input : Inputs) {
    // Constants may legitimately have users outside of the cluster.
    if (isa<ConstantSDNode>(Input))
      continue;
    if (!ScanUsers(Input))
      return SDValue();
  }

  for (const SDValue &PromOp : PromOps)
    if (!ScanUsers(PromOp))
      return SDValue();

  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
  bool ReallyNeedsExt = false;
  if (ExtOpc != ISD::ANY_EXTEND) {
    // If all of the inputs are not already sign/zero extended, then
    // we'll still need to do that at the end.
    for (const SDValue &Input : Inputs) {
      if (isa<ConstantSDNode>(Input))
        continue;

      SDValue InSrc = Input.getOperand(0);
      unsigned OpBits = InSrc.getValueSizeInBits();
      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");

      if ((ExtOpc == ISD::ZERO_EXTEND &&
           !DAG.MaskedValueIsZero(
               InSrc, APInt::getHighBitsSet(OpBits, OpBits - PromBits))) ||
          (ExtOpc == ISD::SIGN_EXTEND &&
           DAG.ComputeNumSignBits(InSrc) < (OpBits - (PromBits - 1)))) {
        ReallyNeedsExt = true;
        break;
      }
    }
  }

  // Brings V to the result type using the same kind of extension as N.
  auto ExtOrTruncToResult = [&](SDValue V) -> SDValue {
    if (ExtOpc == ISD::SIGN_EXTEND)
      return DAG.getSExtOrTrunc(V, dl, ResVT);
    if (ExtOpc == ISD::ZERO_EXTEND)
      return DAG.getZExtOrTrunc(V, dl, ResVT);
    return DAG.getAnyExtOrTrunc(V, dl, ResVT);
  };

  // Replace all inputs, either with the truncation operand, or a
  // truncation or extension to the final output type.
  for (const SDValue &Input : Inputs) {
    // Constant inputs need to be replaced with the to-be-promoted nodes that
    // use them because they might have users outside of the cluster of
    // promoted nodes.
    if (isa<ConstantSDNode>(Input))
      continue;

    SDValue InSrc = Input.getOperand(0);
    // When the value that was truncated already has the final type it can be
    // used directly. (This used to test the type of the truncation itself,
    // which is always narrower than the result type, so the shortcut could
    // never be taken; the helpers below return InSrc unchanged in that case,
    // so the resulting DAG is the same either way.)
    if (InSrc.getValueType() == ResVT)
      DAG.ReplaceAllUsesOfValueWith(Input, InSrc);
    else
      DAG.ReplaceAllUsesOfValueWith(Input, ExtOrTruncToResult(InSrc));
  }

  std::list<HandleSDNode> PromOpHandles;
  for (auto &PromOp : PromOps)
    PromOpHandles.emplace_back(PromOp);

  // Replace all operations (these are all the same, but have a different
  // (promoted) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first.
  while (!PromOpHandles.empty()) {
    SDValue PromOp = PromOpHandles.back().getValue();
    PromOpHandles.pop_back();

    const bool IsSelectLike = PromOp.getOpcode() == ISD::SELECT ||
                              PromOp.getOpcode() == ISD::SELECT_CC;

    // Index of the first of the two value operands that get promoted.
    unsigned C;
    switch (PromOp.getOpcode()) {
    default:             C = 0; break;
    case ISD::SELECT:    C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != ResVT) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C + 1)) &&
         PromOp.getOperand(C + 1).getValueType() != ResVT)) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOpHandles.emplace_front(PromOp);
      continue;
    }

    // For SELECT and SELECT_CC nodes, we do a similar check for any
    // to-be-promoted comparison inputs.
    if (IsSelectLike) {
      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
           PromOp.getOperand(0).getValueType() != ResVT) ||
          (SelectTruncOp[1].count(PromOp.getNode()) &&
           PromOp.getOperand(1).getValueType() != ResVT)) {
        PromOpHandles.emplace_front(PromOp);
        continue;
      }
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If this node has constant inputs, then they'll need to be promoted here.
    for (unsigned i = 0; i < 2; ++i) {
      SDValue &Op = Ops[C + i];
      if (!isa<ConstantSDNode>(Op))
        continue;
      if (Op.getValueType() == ResVT)
        continue;

      Op = ExtOrTruncToResult(Op);
    }

    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
    // truncate them again to the original value type.
    if (IsSelectLike) {
      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
      if (SI0 != SelectTruncOp[0].end())
        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
      if (SI1 != SelectTruncOp[1].end())
        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
    }

    DAG.ReplaceAllUsesOfValueWith(
        PromOp, DAG.getNode(PromOp.getOpcode(), dl, ResVT, Ops));
  }

  // Now we're left with the initial extension itself. Operand 0 of N must be
  // re-read here because the replacements above have rewired it.
  if (!ReallyNeedsExt)
    return N->getOperand(0);

  // To zero extend, just mask off everything except for the first bit (in the
  // i1 case).
  if (ExtOpc == ISD::ZERO_EXTEND)
    return DAG.getNode(ISD::AND, dl, ResVT, N->getOperand(0),
                       DAG.getConstant(APInt::getLowBitsSet(
                                           N->getValueSizeInBits(0), PromBits),
                                       dl, ResVT));

  assert(ExtOpc == ISD::SIGN_EXTEND &&
         "Invalid extension type");
  // Sign extend by shifting the promoted bits to the top and arithmetically
  // shifting them back down.
  EVT ShiftAmountTy = getShiftAmountTy(ResVT, DAG.getDataLayout());
  SDValue ShiftCst =
      DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
  return DAG.getNode(
      ISD::SRA, dl, ResVT,
      DAG.getNode(ISD::SHL, dl, ResVT, N->getOperand(0), ShiftCst),
      ShiftCst);
}
14621
14622
/// Combine a SETCC node.
///
/// For equality and inequality comparisons where one side is a single-use
/// negation (0 - y), the comparison is rewritten as (x + y) ==/!= 0.
/// Everything else is handed to DAGCombineTruncBoolExt.
SDValue PPCTargetLowering::combineSetCC(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC &&
         "Should be called with a SETCC node");

  const ISD::CondCode Pred = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (Pred != ISD::SETEQ && Pred != ISD::SETNE)
    return DAGCombineTruncBoolExt(N, DCI);

  // Matches a '0 - y' whose result is used exactly once.
  auto IsSingleUseNegation = [](SDValue V) {
    return V.getOpcode() == ISD::SUB && isNullConstant(V.getOperand(0)) &&
           V.hasOneUse();
  };

  SDValue Lhs = N->getOperand(0);
  SDValue Rhs = N->getOperand(1);

  // Canonicalize a negation on the left-hand side over to the right.
  if (IsSingleUseNegation(Lhs))
    std::swap(Lhs, Rhs);

  if (!IsSingleUseNegation(Rhs))
    return DAGCombineTruncBoolExt(N, DCI);

  // x == 0-y  -->  x+y == 0
  // x != 0-y  -->  x+y != 0
  SDLoc DL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT ResultVT = N->getValueType(0);
  EVT OperandVT = Lhs.getValueType();
  SDValue Sum = DAG.getNode(ISD::ADD, DL, OperandVT, Lhs, Rhs.getOperand(1));
  return DAG.getSetCC(DL, ResultVT, Sum, DAG.getConstant(0, DL, OperandVT),
                      Pred);
}
14652
14653
// Is this an extending load from an f32 to an f64?
14654
static bool isFPExtLoad(SDValue Op) {
14655
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
14656
return LD->getExtensionType() == ISD::EXTLOAD &&
14657
Op.getValueType() == MVT::f64;
14658
return false;
14659
}
14660
14661
/// Reduces the number of fp-to-int conversion when building a vector.
14662
///
14663
/// If this vector is built out of floating to integer conversions,
14664
/// transform it to a vector built out of floating point values followed by a
14665
/// single floating to integer conversion of the vector.
14666
/// Namely (build_vector (fptosi $A), (fptosi $B), ...)
14667
/// becomes (fptosi (build_vector ($A, $B, ...)))
14668
SDValue PPCTargetLowering::
14669
combineElementTruncationToVectorTruncation(SDNode *N,
14670
DAGCombinerInfo &DCI) const {
14671
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
14672
"Should be called with a BUILD_VECTOR node");
14673
14674
SelectionDAG &DAG = DCI.DAG;
14675
SDLoc dl(N);
14676
14677
SDValue FirstInput = N->getOperand(0);
14678
assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
14679
"The input operand must be an fp-to-int conversion.");
14680
14681
// This combine happens after legalization so the fp_to_[su]i nodes are
14682
// already converted to PPCSISD nodes.
14683
unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
14684
if (FirstConversion == PPCISD::FCTIDZ ||
14685
FirstConversion == PPCISD::FCTIDUZ ||
14686
FirstConversion == PPCISD::FCTIWZ ||
14687
FirstConversion == PPCISD::FCTIWUZ) {
14688
bool IsSplat = true;
14689
bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
14690
FirstConversion == PPCISD::FCTIWUZ;
14691
EVT SrcVT = FirstInput.getOperand(0).getValueType();
14692
SmallVector<SDValue, 4> Ops;
14693
EVT TargetVT = N->getValueType(0);
14694
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
14695
SDValue NextOp = N->getOperand(i);
14696
if (NextOp.getOpcode() != PPCISD::MFVSR)
14697
return SDValue();
14698
unsigned NextConversion = NextOp.getOperand(0).getOpcode();
14699
if (NextConversion != FirstConversion)
14700
return SDValue();
14701
// If we are converting to 32-bit integers, we need to add an FP_ROUND.
14702
// This is not valid if the input was originally double precision. It is
14703
// also not profitable to do unless this is an extending load in which
14704
// case doing this combine will allow us to combine consecutive loads.
14705
if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
14706
return SDValue();
14707
if (N->getOperand(i) != FirstInput)
14708
IsSplat = false;
14709
}
14710
14711
// If this is a splat, we leave it as-is since there will be only a single
14712
// fp-to-int conversion followed by a splat of the integer. This is better
14713
// for 32-bit and smaller ints and neutral for 64-bit ints.
14714
if (IsSplat)
14715
return SDValue();
14716
14717
// Now that we know we have the right type of node, get its operands
14718
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
14719
SDValue In = N->getOperand(i).getOperand(0);
14720
if (Is32Bit) {
14721
// For 32-bit values, we need to add an FP_ROUND node (if we made it
14722
// here, we know that all inputs are extending loads so this is safe).
14723
if (In.isUndef())
14724
Ops.push_back(DAG.getUNDEF(SrcVT));
14725
else {
14726
SDValue Trunc =
14727
DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, In.getOperand(0),
14728
DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
14729
Ops.push_back(Trunc);
14730
}
14731
} else
14732
Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
14733
}
14734
14735
unsigned Opcode;
14736
if (FirstConversion == PPCISD::FCTIDZ ||
14737
FirstConversion == PPCISD::FCTIWZ)
14738
Opcode = ISD::FP_TO_SINT;
14739
else
14740
Opcode = ISD::FP_TO_UINT;
14741
14742
EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
14743
SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
14744
return DAG.getNode(Opcode, dl, TargetVT, BV);
14745
}
14746
return SDValue();
14747
}
14748
14749
/// Reduce the number of loads when building a vector.
14750
///
14751
/// Building a vector out of multiple loads can be converted to a load
14752
/// of the vector type if the loads are consecutive. If the loads are
14753
/// consecutive but in descending order, a shuffle is added at the end
14754
/// to reorder the vector.
14755
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
14756
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
14757
"Should be called with a BUILD_VECTOR node");
14758
14759
SDLoc dl(N);
14760
14761
// Return early for non byte-sized type, as they can't be consecutive.
14762
if (!N->getValueType(0).getVectorElementType().isByteSized())
14763
return SDValue();
14764
14765
bool InputsAreConsecutiveLoads = true;
14766
bool InputsAreReverseConsecutive = true;
14767
unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
14768
SDValue FirstInput = N->getOperand(0);
14769
bool IsRoundOfExtLoad = false;
14770
LoadSDNode *FirstLoad = nullptr;
14771
14772
if (FirstInput.getOpcode() == ISD::FP_ROUND &&
14773
FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
14774
FirstLoad = cast<LoadSDNode>(FirstInput.getOperand(0));
14775
IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD;
14776
}
14777
// Not a build vector of (possibly fp_rounded) loads.
14778
if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
14779
N->getNumOperands() == 1)
14780
return SDValue();
14781
14782
if (!IsRoundOfExtLoad)
14783
FirstLoad = cast<LoadSDNode>(FirstInput);
14784
14785
SmallVector<LoadSDNode *, 4> InputLoads;
14786
InputLoads.push_back(FirstLoad);
14787
for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
14788
// If any inputs are fp_round(extload), they all must be.
14789
if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
14790
return SDValue();
14791
14792
SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
14793
N->getOperand(i);
14794
if (NextInput.getOpcode() != ISD::LOAD)
14795
return SDValue();
14796
14797
SDValue PreviousInput =
14798
IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
14799
LoadSDNode *LD1 = cast<LoadSDNode>(PreviousInput);
14800
LoadSDNode *LD2 = cast<LoadSDNode>(NextInput);
14801
14802
// If any inputs are fp_round(extload), they all must be.
14803
if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
14804
return SDValue();
14805
14806
// We only care about regular loads. The PPC-specific load intrinsics
14807
// will not lead to a merge opportunity.
14808
if (!DAG.areNonVolatileConsecutiveLoads(LD2, LD1, ElemSize, 1))
14809
InputsAreConsecutiveLoads = false;
14810
if (!DAG.areNonVolatileConsecutiveLoads(LD1, LD2, ElemSize, 1))
14811
InputsAreReverseConsecutive = false;
14812
14813
// Exit early if the loads are neither consecutive nor reverse consecutive.
14814
if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
14815
return SDValue();
14816
InputLoads.push_back(LD2);
14817
}
14818
14819
assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
14820
"The loads cannot be both consecutive and reverse consecutive.");
14821
14822
SDValue WideLoad;
14823
SDValue ReturnSDVal;
14824
if (InputsAreConsecutiveLoads) {
14825
assert(FirstLoad && "Input needs to be a LoadSDNode.");
14826
WideLoad = DAG.getLoad(N->getValueType(0), dl, FirstLoad->getChain(),
14827
FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
14828
FirstLoad->getAlign());
14829
ReturnSDVal = WideLoad;
14830
} else if (InputsAreReverseConsecutive) {
14831
LoadSDNode *LastLoad = InputLoads.back();
14832
assert(LastLoad && "Input needs to be a LoadSDNode.");
14833
WideLoad = DAG.getLoad(N->getValueType(0), dl, LastLoad->getChain(),
14834
LastLoad->getBasePtr(), LastLoad->getPointerInfo(),
14835
LastLoad->getAlign());
14836
SmallVector<int, 16> Ops;
14837
for (int i = N->getNumOperands() - 1; i >= 0; i--)
14838
Ops.push_back(i);
14839
14840
ReturnSDVal = DAG.getVectorShuffle(N->getValueType(0), dl, WideLoad,
14841
DAG.getUNDEF(N->getValueType(0)), Ops);
14842
} else
14843
return SDValue();
14844
14845
for (auto *LD : InputLoads)
14846
DAG.makeEquivalentMemoryOrdering(LD, WideLoad);
14847
return ReturnSDVal;
14848
}
14849
14850
// This function adds the required vector_shuffle needed to get
14851
// the elements of the vector extract in the correct position
14852
// as specified by the CorrectElems encoding.
14853
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
14854
SDValue Input, uint64_t Elems,
14855
uint64_t CorrectElems) {
14856
SDLoc dl(N);
14857
14858
unsigned NumElems = Input.getValueType().getVectorNumElements();
14859
SmallVector<int, 16> ShuffleMask(NumElems, -1);
14860
14861
// Knowing the element indices being extracted from the original
14862
// vector and the order in which they're being inserted, just put
14863
// them at element indices required for the instruction.
14864
for (unsigned i = 0; i < N->getNumOperands(); i++) {
14865
if (DAG.getDataLayout().isLittleEndian())
14866
ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
14867
else
14868
ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
14869
CorrectElems = CorrectElems >> 8;
14870
Elems = Elems >> 8;
14871
}
14872
14873
SDValue Shuffle =
14874
DAG.getVectorShuffle(Input.getValueType(), dl, Input,
14875
DAG.getUNDEF(Input.getValueType()), ShuffleMask);
14876
14877
EVT VT = N->getValueType(0);
14878
SDValue Conv = DAG.getBitcast(VT, Shuffle);
14879
14880
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
14881
Input.getValueType().getVectorElementType(),
14882
VT.getVectorNumElements());
14883
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,
14884
DAG.getValueType(ExtVT));
14885
}
14886
14887
// Look for build vector patterns where input operands come from sign
14888
// extended vector_extract elements of specific indices. If the correct indices
14889
// aren't used, add a vector shuffle to fix up the indices and create
14890
// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
14891
// during instruction selection.
14892
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
14893
// This array encodes the indices that the vector sign extend instructions
14894
// extract from when extending from one type to another for both BE and LE.
14895
// The right nibble of each byte corresponds to the LE indices,
14896
// and the left nibble of each byte corresponds to the BE indices.
14897
// For example: 0x3074B8FC byte->word
14898
// For LE: the allowed indices are: 0x0,0x4,0x8,0xC
14899
// For BE: the allowed indices are: 0x3,0x7,0xB,0xF
14900
// For example: 0x000070F8 byte->double word
14901
// For LE: the allowed indices are: 0x0,0x8
14902
// For BE: the allowed indices are: 0x7,0xF
14903
uint64_t TargetElems[] = {
14904
0x3074B8FC, // b->w
14905
0x000070F8, // b->d
14906
0x10325476, // h->w
14907
0x00003074, // h->d
14908
0x00001032, // w->d
14909
};
14910
14911
uint64_t Elems = 0;
14912
int Index;
14913
SDValue Input;
14914
14915
auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
14916
if (!Op)
14917
return false;
14918
if (Op.getOpcode() != ISD::SIGN_EXTEND &&
14919
Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
14920
return false;
14921
14922
// A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
14923
// of the right width.
14924
SDValue Extract = Op.getOperand(0);
14925
if (Extract.getOpcode() == ISD::ANY_EXTEND)
14926
Extract = Extract.getOperand(0);
14927
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14928
return false;
14929
14930
ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
14931
if (!ExtOp)
14932
return false;
14933
14934
Index = ExtOp->getZExtValue();
14935
if (Input && Input != Extract.getOperand(0))
14936
return false;
14937
14938
if (!Input)
14939
Input = Extract.getOperand(0);
14940
14941
Elems = Elems << 8;
14942
Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
14943
Elems |= Index;
14944
14945
return true;
14946
};
14947
14948
// If the build vector operands aren't sign extended vector extracts,
14949
// of the same input vector, then return.
14950
for (unsigned i = 0; i < N->getNumOperands(); i++) {
14951
if (!isSExtOfVecExtract(N->getOperand(i))) {
14952
return SDValue();
14953
}
14954
}
14955
14956
// If the vector extract indices are not correct, add the appropriate
14957
// vector_shuffle.
14958
int TgtElemArrayIdx;
14959
int InputSize = Input.getValueType().getScalarSizeInBits();
14960
int OutputSize = N->getValueType(0).getScalarSizeInBits();
14961
if (InputSize + OutputSize == 40)
14962
TgtElemArrayIdx = 0;
14963
else if (InputSize + OutputSize == 72)
14964
TgtElemArrayIdx = 1;
14965
else if (InputSize + OutputSize == 48)
14966
TgtElemArrayIdx = 2;
14967
else if (InputSize + OutputSize == 80)
14968
TgtElemArrayIdx = 3;
14969
else if (InputSize + OutputSize == 96)
14970
TgtElemArrayIdx = 4;
14971
else
14972
return SDValue();
14973
14974
uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
14975
CorrectElems = DAG.getDataLayout().isLittleEndian()
14976
? CorrectElems & 0x0F0F0F0F0F0F0F0F
14977
: CorrectElems & 0xF0F0F0F0F0F0F0F0;
14978
if (Elems != CorrectElems) {
14979
return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
14980
}
14981
14982
// Regular lowering will catch cases where a shuffle is not needed.
14983
return SDValue();
14984
}
14985
14986
// Look for the pattern of a load from a narrow width to i128, feeding
14987
// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
14988
// (LXVRZX). This node represents a zero extending load that will be matched
14989
// to the Load VSX Vector Rightmost instructions.
14990
static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
14991
SDLoc DL(N);
14992
14993
// This combine is only eligible for a BUILD_VECTOR of v1i128.
14994
if (N->getValueType(0) != MVT::v1i128)
14995
return SDValue();
14996
14997
SDValue Operand = N->getOperand(0);
14998
// Proceed with the transformation if the operand to the BUILD_VECTOR
14999
// is a load instruction.
15000
if (Operand.getOpcode() != ISD::LOAD)
15001
return SDValue();
15002
15003
auto *LD = cast<LoadSDNode>(Operand);
15004
EVT MemoryType = LD->getMemoryVT();
15005
15006
// This transformation is only valid if the we are loading either a byte,
15007
// halfword, word, or doubleword.
15008
bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
15009
MemoryType == MVT::i32 || MemoryType == MVT::i64;
15010
15011
// Ensure that the load from the narrow width is being zero extended to i128.
15012
if (!ValidLDType ||
15013
(LD->getExtensionType() != ISD::ZEXTLOAD &&
15014
LD->getExtensionType() != ISD::EXTLOAD))
15015
return SDValue();
15016
15017
SDValue LoadOps[] = {
15018
LD->getChain(), LD->getBasePtr(),
15019
DAG.getIntPtrConstant(MemoryType.getScalarSizeInBits(), DL)};
15020
15021
return DAG.getMemIntrinsicNode(PPCISD::LXVRZX, DL,
15022
DAG.getVTList(MVT::v1i128, MVT::Other),
15023
LoadOps, MemoryType, LD->getMemOperand());
15024
}
15025
15026
// Target DAG combine for BUILD_VECTOR nodes. Tries, in order, the element
// truncation, consecutive load, vector sign extend and zero-extending load
// combines, and finally turns a v2f64 build_vector of two int-to-fp
// conversions of adjacent v4i32 elements into a single vector conversion.
// Returns an empty SDValue when nothing applies.
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  // Every combine below requires VSX.
  if (!Subtarget.hasVSX())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue FirstInput = N->getOperand(0);

  // The target independent DAG combiner will leave a build_vector of
  // float-to-int conversions intact. We can generate MUCH better code for
  // a float-to-int conversion of a vector of floats.
  if (FirstInput.getOpcode() == PPCISD::MFVSR)
    if (SDValue Truncated = combineElementTruncationToVectorTruncation(N, DCI))
      return Truncated;

  // If we're building a vector out of consecutive loads, just load that
  // vector type.
  if (SDValue FromLoads = combineBVOfConsecutiveLoads(N, DAG))
    return FromLoads;

  // If we're building a vector out of extended elements from another vector
  // we have P9 vector integer extend instructions. The code assumes legal
  // input types (i.e. it can't handle things like v4i16) so do not run before
  // legalization.
  if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize())
    if (SDValue FromSExt = combineBVOfVecSExt(N, DAG))
      return FromSExt;

  // On Power10, the Load VSX Vector Rightmost instructions can be utilized
  // if this is a BUILD_VECTOR of v1i128, and if the operand to the
  // BUILD_VECTOR is a load from <valid narrow width> to i128.
  if (Subtarget.isISA3_1())
    if (SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG))
      return BVOfZLoad;

  // What remains only handles v2f64:
  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
  if (N->getValueType(0) != MVT::v2f64)
    return SDValue();

  // Both operands must be the same kind of int-to-fp conversion.
  SDValue SecondInput = N->getOperand(1);
  const unsigned ConvOpc = FirstInput.getOpcode();
  if (ConvOpc != ISD::SINT_TO_FP && ConvOpc != ISD::UINT_TO_FP)
    return SDValue();
  if (SecondInput.getOpcode() != ConvOpc)
    return SDValue();

  // Both conversions must be fed by constant-index extracts...
  SDValue Ext1 = FirstInput.getOperand(0);
  SDValue Ext2 = SecondInput.getOperand(0);
  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
  if (!Ext1Op || !Ext2Op)
    return SDValue();

  // ...from the very same v4i32 vector.
  SDValue SrcVec = Ext1.getOperand(0);
  if (SrcVec.getValueType() != MVT::v4i32 || SrcVec != Ext2.getOperand(0))
    return SDValue();

  // Only the element pairs (0,1) and (2,3) are handled; which subvector
  // index they map to depends on endianness.
  int FirstElem = Ext1Op->getZExtValue();
  int SecondElem = Ext2Op->getZExtValue();
  const bool IsPair01 = FirstElem == 0 && SecondElem == 1;
  const bool IsPair23 = FirstElem == 2 && SecondElem == 3;
  if (!IsPair01 && !IsPair23)
    return SDValue();
  // (0,1) -> LE ? 1 : 0 and (2,3) -> LE ? 0 : 1.
  int SubvecIdx = (IsPair01 == Subtarget.isLittleEndian()) ? 1 : 0;

  auto NodeType = (ConvOpc == ISD::SINT_TO_FP) ? PPCISD::SINT_VEC_TO_FP
                                               : PPCISD::UINT_VEC_TO_FP;
  return DAG.getNode(NodeType, dl, MVT::v2f64, SrcVec,
                     DAG.getIntPtrConstant(SubvecIdx, dl));
}
15116
15117
// DAG combine for SINT_TO_FP / UINT_TO_FP producing f32 or f64. Two patterns
// are handled:
//  * the integer source is an i8/i16 load (with P9 vector support): load it
//    with LXSIZX, sign extend in the register for signed conversions, and
//    convert from there;
//  * the integer source is itself an FP_TO_SINT / FP_TO_UINT: chain the
//    FCTID*/FCFID* nodes directly.
// Returns an empty SDValue when neither applies.
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::SINT_TO_FP ||
          N->getOpcode() == ISD::UINT_TO_FP) &&
         "Need an int -> FP conversion node here");

  if (useSoftFloat() || !Subtarget.has64BitSupport())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Op(N, 0);

  // Only f32 and f64 results are handled; this excludes ppc_fp128.
  const bool ToF32 = Op.getValueType() == MVT::f32;
  const bool ToF64 = Op.getValueType() == MVT::f64;
  if (!ToF32 && !ToF64)
    return SDValue();

  // The integer source must be a simple type wider than i1 and no wider
  // than i64.
  SDValue FirstOperand(Op.getOperand(0));
  const auto IntVT = FirstOperand.getValueType();
  if (!IntVT.isSimple())
    return SDValue();
  if (IntVT.getSimpleVT() <= MVT(MVT::i1) ||
      IntVT.getSimpleVT() > MVT(MVT::i64))
    return SDValue();

  const bool IsUnsignedConv = Op.getOpcode() == ISD::UINT_TO_FP;
  const bool HasFPCVT = Subtarget.hasFPCVT();

  // Conversion fed by a byte or halfword load.
  const bool IsByte = IntVT == MVT::i8;
  const bool SubWordLoad =
      FirstOperand.getOpcode() == ISD::LOAD && (IsByte || IntVT == MVT::i16);
  if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
    const bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
    unsigned ConvOp;
    if (Signed)
      ConvOp = ToF64 ? PPCISD::FCFID : PPCISD::FCFIDS;
    else
      ConvOp = ToF64 ? PPCISD::FCFIDU : PPCISD::FCFIDUS;
    MVT ConvTy = ToF64 ? MVT::f64 : MVT::f32;

    // Width of the loaded value in bytes.
    SDValue WidthConst = DAG.getIntPtrConstant(IsByte ? 1 : 2, dl, false);
    LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
    // NOTE(review): the memory VT is passed as MVT::i8 even when WidthConst
    // is 2 (an i16 load) - confirm that this is intended.
    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
                                         DAG.getVTList(MVT::f64, MVT::Other),
                                         Ops, MVT::i8, LDN->getMemOperand());
    DAG.makeEquivalentMemoryOrdering(LDN, Ld);

    // For signed conversion, we need to sign-extend the value in the VSR.
    SDValue ConvInput = Ld;
    if (Signed) {
      SDValue ExtOps[] = { Ld, WidthConst };
      ConvInput = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
    }
    return DAG.getNode(ConvOp, dl, ConvTy, ConvInput);
  }

  // For i32 intermediate values, unfortunately, the conversion functions
  // leave the upper 32 bits of the value undefined. Within the set of
  // scalar instructions, we have no method for zero- or sign-extending the
  // value. Thus, we cannot handle i32 intermediate values here.
  if (IntVT == MVT::i32)
    return SDValue();

  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  const bool UseSingleForm = HasFPCVT && ToF32;
  unsigned FCFOp;
  if (UseSingleForm)
    FCFOp = IsUnsignedConv ? PPCISD::FCFIDUS : PPCISD::FCFIDS;
  else
    FCFOp = IsUnsignedConv ? PPCISD::FCFIDU : PPCISD::FCFID;
  MVT FCFTy = UseSingleForm ? MVT::f32 : MVT::f64;

  // If we're converting from a float, to an int, and back to a float again,
  // then we don't need the store/load pair at all. The unsigned form is only
  // usable with FPCVT.
  const unsigned SrcOpc = FirstOperand.getOpcode();
  const bool FromFPToSInt = SrcOpc == ISD::FP_TO_SINT;
  const bool FromFPToUInt = SrcOpc == ISD::FP_TO_UINT && HasFPCVT;
  if (!FromFPToUInt && !FromFPToSInt)
    return SDValue();

  SDValue Src = FirstOperand.getOperand(0);
  if (Src.getValueType() == MVT::f32) {
    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
    DCI.AddToWorklist(Src.getNode());
  } else if (Src.getValueType() != MVT::f64) {
    // Make sure that we don't pick up a ppc_fp128 source value.
    return SDValue();
  }

  unsigned FCTOp = FromFPToSInt ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
  SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);

  // Without FPCVT the conversion above produced f64; round it to f32.
  if (ToF32 && !HasFPCVT) {
    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
                     DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
    DCI.AddToWorklist(FP.getNode());
  }

  return FP;
}
15223
15224
// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
15225
// builtins) into loads with swaps.
15226
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
15227
DAGCombinerInfo &DCI) const {
15228
// Delay VSX load for LE combine until after LegalizeOps to prioritize other
15229
// load combines.
15230
if (DCI.isBeforeLegalizeOps())
15231
return SDValue();
15232
15233
SelectionDAG &DAG = DCI.DAG;
15234
SDLoc dl(N);
15235
SDValue Chain;
15236
SDValue Base;
15237
MachineMemOperand *MMO;
15238
15239
switch (N->getOpcode()) {
15240
default:
15241
llvm_unreachable("Unexpected opcode for little endian VSX load");
15242
case ISD::LOAD: {
15243
LoadSDNode *LD = cast<LoadSDNode>(N);
15244
Chain = LD->getChain();
15245
Base = LD->getBasePtr();
15246
MMO = LD->getMemOperand();
15247
// If the MMO suggests this isn't a load of a full vector, leave
15248
// things alone. For a built-in, we have to make the change for
15249
// correctness, so if there is a size problem that will be a bug.
15250
if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
15251
return SDValue();
15252
break;
15253
}
15254
case ISD::INTRINSIC_W_CHAIN: {
15255
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
15256
Chain = Intrin->getChain();
15257
// Similarly to the store case below, Intrin->getBasePtr() doesn't get
15258
// us what we want. Get operand 2 instead.
15259
Base = Intrin->getOperand(2);
15260
MMO = Intrin->getMemOperand();
15261
break;
15262
}
15263
}
15264
15265
MVT VecTy = N->getValueType(0).getSimpleVT();
15266
15267
SDValue LoadOps[] = { Chain, Base };
15268
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
15269
DAG.getVTList(MVT::v2f64, MVT::Other),
15270
LoadOps, MVT::v2f64, MMO);
15271
15272
DCI.AddToWorklist(Load.getNode());
15273
Chain = Load.getValue(1);
15274
SDValue Swap = DAG.getNode(
15275
PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
15276
DCI.AddToWorklist(Swap.getNode());
15277
15278
// Add a bitcast if the resulting load type doesn't match v2f64.
15279
if (VecTy != MVT::v2f64) {
15280
SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
15281
DCI.AddToWorklist(N.getNode());
15282
// Package {bitcast value, swap's chain} to match Load's shape.
15283
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
15284
N, Swap.getValue(1));
15285
}
15286
15287
return Swap;
15288
}
15289
15290
// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
15291
// builtins) into stores with swaps.
15292
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
15293
DAGCombinerInfo &DCI) const {
15294
// Delay VSX store for LE combine until after LegalizeOps to prioritize other
15295
// store combines.
15296
if (DCI.isBeforeLegalizeOps())
15297
return SDValue();
15298
15299
SelectionDAG &DAG = DCI.DAG;
15300
SDLoc dl(N);
15301
SDValue Chain;
15302
SDValue Base;
15303
unsigned SrcOpnd;
15304
MachineMemOperand *MMO;
15305
15306
switch (N->getOpcode()) {
15307
default:
15308
llvm_unreachable("Unexpected opcode for little endian VSX store");
15309
case ISD::STORE: {
15310
StoreSDNode *ST = cast<StoreSDNode>(N);
15311
Chain = ST->getChain();
15312
Base = ST->getBasePtr();
15313
MMO = ST->getMemOperand();
15314
SrcOpnd = 1;
15315
// If the MMO suggests this isn't a store of a full vector, leave
15316
// things alone. For a built-in, we have to make the change for
15317
// correctness, so if there is a size problem that will be a bug.
15318
if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
15319
return SDValue();
15320
break;
15321
}
15322
case ISD::INTRINSIC_VOID: {
15323
MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
15324
Chain = Intrin->getChain();
15325
// Intrin->getBasePtr() oddly does not get what we want.
15326
Base = Intrin->getOperand(3);
15327
MMO = Intrin->getMemOperand();
15328
SrcOpnd = 2;
15329
break;
15330
}
15331
}
15332
15333
SDValue Src = N->getOperand(SrcOpnd);
15334
MVT VecTy = Src.getValueType().getSimpleVT();
15335
15336
// All stores are done as v2f64 and possible bit cast.
15337
if (VecTy != MVT::v2f64) {
15338
Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
15339
DCI.AddToWorklist(Src.getNode());
15340
}
15341
15342
SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
15343
DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
15344
DCI.AddToWorklist(Swap.getNode());
15345
Chain = Swap.getValue(1);
15346
SDValue StoreOps[] = { Chain, Swap, Base };
15347
SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
15348
DAG.getVTList(MVT::Other),
15349
StoreOps, VecTy, MMO);
15350
DCI.AddToWorklist(Store.getNode());
15351
return Store;
15352
}
15353
15354
// Handle DAG combine for STORE (FP_TO_INT F).
15355
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
15356
DAGCombinerInfo &DCI) const {
15357
SelectionDAG &DAG = DCI.DAG;
15358
SDLoc dl(N);
15359
unsigned Opcode = N->getOperand(1).getOpcode();
15360
(void)Opcode;
15361
bool Strict = N->getOperand(1)->isStrictFPOpcode();
15362
15363
assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
15364
Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT)
15365
&& "Not a FP_TO_INT Instruction!");
15366
15367
SDValue Val = N->getOperand(1).getOperand(Strict ? 1 : 0);
15368
EVT Op1VT = N->getOperand(1).getValueType();
15369
EVT ResVT = Val.getValueType();
15370
15371
if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(ResVT))
15372
return SDValue();
15373
15374
// Only perform combine for conversion to i64/i32 or power9 i16/i8.
15375
bool ValidTypeForStoreFltAsInt =
15376
(Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) ||
15377
(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
15378
15379
// TODO: Lower conversion from f128 on all VSX targets
15380
if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector()))
15381
return SDValue();
15382
15383
if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) ||
15384
cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
15385
return SDValue();
15386
15387
Val = convertFPToInt(N->getOperand(1), DAG, Subtarget);
15388
15389
// Set number of bytes being converted.
15390
unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
15391
SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2),
15392
DAG.getIntPtrConstant(ByteSize, dl, false),
15393
DAG.getValueType(Op1VT)};
15394
15395
Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
15396
DAG.getVTList(MVT::Other), Ops,
15397
cast<StoreSDNode>(N)->getMemoryVT(),
15398
cast<StoreSDNode>(N)->getMemOperand());
15399
15400
return Val;
15401
}
15402
15403
static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
15404
// Check that the source of the element keeps flipping
15405
// (i.e. Mask[i] < NumElts -> Mask[i+i] >= NumElts).
15406
bool PrevElemFromFirstVec = Mask[0] < NumElts;
15407
for (int i = 1, e = Mask.size(); i < e; i++) {
15408
if (PrevElemFromFirstVec && Mask[i] < NumElts)
15409
return false;
15410
if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
15411
return false;
15412
PrevElemFromFirstVec = !PrevElemFromFirstVec;
15413
}
15414
return true;
15415
}
15416
15417
// Return true if Op is a BUILD_VECTOR in which every operand is either undef
// or identical to the first non-undef operand.
static bool isSplatBV(SDValue Op) {
  if (Op.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  const int NumOps = Op.getNumOperands();

  // Find the first non-undef input; if every input is undef this ends up
  // holding the last one.
  SDValue Reference;
  for (int Pos = 0; Pos < NumOps; ++Pos) {
    Reference = Op.getOperand(Pos);
    if (!Reference.isUndef())
      break;
  }

  // All inputs must be undef or the same as the first non-undef input.
  for (int Pos = 1; Pos < NumOps; ++Pos) {
    SDValue Current = Op.getOperand(Pos);
    if (Current != Reference && !Current.isUndef())
      return false;
  }
  return true;
}
15435
15436
// If Op is a SCALAR_TO_VECTOR, or a BITCAST whose operand is one, return
// that SCALAR_TO_VECTOR node; otherwise return an empty SDValue.
static SDValue isScalarToVec(SDValue Op) {
  // Look through at most one bitcast.
  SDValue Candidate = Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op;
  if (Candidate.getOpcode() != ISD::SCALAR_TO_VECTOR)
    return SDValue();
  return Candidate;
}
15446
15447
// Fix up the shuffle mask to account for the fact that the result of
15448
// scalar_to_vector is not in lane zero. This just takes all values in
15449
// the ranges specified by the min/max indices and adds the number of
15450
// elements required to ensure each element comes from the respective
15451
// position in the valid lane.
15452
// On little endian, that's just the corresponding element in the other
15453
// half of the vector. On big endian, it is in the same half but right
15454
// justified rather than left justified in that half.
15455
static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
15456
int LHSMaxIdx, int RHSMinIdx,
15457
int RHSMaxIdx, int HalfVec,
15458
unsigned ValidLaneWidth,
15459
const PPCSubtarget &Subtarget) {
15460
for (int i = 0, e = ShuffV.size(); i < e; i++) {
15461
int Idx = ShuffV[i];
15462
if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
15463
ShuffV[i] +=
15464
Subtarget.isLittleEndian() ? HalfVec : HalfVec - ValidLaneWidth;
15465
}
15466
}
15467
15468
// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
15469
// the original is:
15470
// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
15471
// In such a case, just change the shuffle mask to extract the element
15472
// from the permuted index.
15473
static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
15474
const PPCSubtarget &Subtarget) {
15475
SDLoc dl(OrigSToV);
15476
EVT VT = OrigSToV.getValueType();
15477
assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
15478
"Expecting a SCALAR_TO_VECTOR here");
15479
SDValue Input = OrigSToV.getOperand(0);
15480
15481
if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15482
ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));
15483
SDValue OrigVector = Input.getOperand(0);
15484
15485
// Can't handle non-const element indices or different vector types
15486
// for the input to the extract and the output of the scalar_to_vector.
15487
if (Idx && VT == OrigVector.getValueType()) {
15488
unsigned NumElts = VT.getVectorNumElements();
15489
assert(
15490
NumElts > 1 &&
15491
"Cannot produce a permuted scalar_to_vector for one element vector");
15492
SmallVector<int, 16> NewMask(NumElts, -1);
15493
unsigned ResultInElt = NumElts / 2;
15494
ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
15495
NewMask[ResultInElt] = Idx->getZExtValue();
15496
return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
15497
}
15498
}
15499
return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT,
15500
OrigSToV.getOperand(0));
15501
}
15502
15503
// On little endian subtargets, combine shuffles such as:
15504
// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
15505
// into:
15506
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
15507
// because the latter can be matched to a single instruction merge.
15508
// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
15509
// to put the value into element zero. Adjust the shuffle mask so that the
15510
// vector can remain in permuted form (to prevent a swap prior to a shuffle).
15511
// On big endian targets, this is still useful for SCALAR_TO_VECTOR
15512
// nodes with elements smaller than doubleword because all the ways
15513
// of getting scalar data into a vector register put the value in the
15514
// rightmost element of the left half of the vector.
15515
SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
15516
SelectionDAG &DAG) const {
15517
SDValue LHS = SVN->getOperand(0);
15518
SDValue RHS = SVN->getOperand(1);
15519
auto Mask = SVN->getMask();
15520
int NumElts = LHS.getValueType().getVectorNumElements();
15521
SDValue Res(SVN, 0);
15522
SDLoc dl(SVN);
15523
bool IsLittleEndian = Subtarget.isLittleEndian();
15524
15525
// On big endian targets this is only useful for subtargets with direct moves.
15526
// On little endian targets it would be useful for all subtargets with VSX.
15527
// However adding special handling for LE subtargets without direct moves
15528
// would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)
15529
// which includes direct moves.
15530
if (!Subtarget.hasDirectMove())
15531
return Res;
15532
15533
// If this is not a shuffle of a shuffle and the first element comes from
15534
// the second vector, canonicalize to the commuted form. This will make it
15535
// more likely to match one of the single instruction patterns.
15536
if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
15537
RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
15538
std::swap(LHS, RHS);
15539
Res = DAG.getCommutedVectorShuffle(*SVN);
15540
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
15541
}
15542
15543
// Adjust the shuffle mask if either input vector comes from a
15544
// SCALAR_TO_VECTOR and keep the respective input vector in permuted
15545
// form (to prevent the need for a swap).
15546
SmallVector<int, 16> ShuffV(Mask);
15547
SDValue SToVLHS = isScalarToVec(LHS);
15548
SDValue SToVRHS = isScalarToVec(RHS);
15549
if (SToVLHS || SToVRHS) {
15550
// FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the
15551
// same type and have differing element sizes, then do not perform
15552
// the following transformation. The current transformation for
15553
// SCALAR_TO_VECTOR assumes that both input vectors have the same
15554
// element size. This will be updated in the future to account for
15555
// differing sizes of the LHS and RHS.
15556
if (SToVLHS && SToVRHS &&
15557
(SToVLHS.getValueType().getScalarSizeInBits() !=
15558
SToVRHS.getValueType().getScalarSizeInBits()))
15559
return Res;
15560
15561
int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
15562
: SToVRHS.getValueType().getVectorNumElements();
15563
int NumEltsOut = ShuffV.size();
15564
// The width of the "valid lane" (i.e. the lane that contains the value that
15565
// is vectorized) needs to be expressed in terms of the number of elements
15566
// of the shuffle. It is thereby the ratio of the values before and after
15567
// any bitcast.
15568
unsigned ValidLaneWidth =
15569
SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() /
15570
LHS.getValueType().getScalarSizeInBits()
15571
: SToVRHS.getValueType().getScalarSizeInBits() /
15572
RHS.getValueType().getScalarSizeInBits();
15573
15574
// Initially assume that neither input is permuted. These will be adjusted
15575
// accordingly if either input is.
15576
int LHSMaxIdx = -1;
15577
int RHSMinIdx = -1;
15578
int RHSMaxIdx = -1;
15579
int HalfVec = LHS.getValueType().getVectorNumElements() / 2;
15580
15581
// Get the permuted scalar to vector nodes for the source(s) that come from
15582
// ISD::SCALAR_TO_VECTOR.
15583
// On big endian systems, this only makes sense for element sizes smaller
15584
// than 64 bits since for 64-bit elements, all instructions already put
15585
// the value into element zero. Since scalar size of LHS and RHS may differ
15586
// after isScalarToVec, this should be checked using their own sizes.
15587
if (SToVLHS) {
15588
if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64)
15589
return Res;
15590
// Set up the values for the shuffle vector fixup.
15591
LHSMaxIdx = NumEltsOut / NumEltsIn;
15592
SToVLHS = getSToVPermuted(SToVLHS, DAG, Subtarget);
15593
if (SToVLHS.getValueType() != LHS.getValueType())
15594
SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
15595
LHS = SToVLHS;
15596
}
15597
if (SToVRHS) {
15598
if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64)
15599
return Res;
15600
RHSMinIdx = NumEltsOut;
15601
RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
15602
SToVRHS = getSToVPermuted(SToVRHS, DAG, Subtarget);
15603
if (SToVRHS.getValueType() != RHS.getValueType())
15604
SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
15605
RHS = SToVRHS;
15606
}
15607
15608
// Fix up the shuffle mask to reflect where the desired element actually is.
15609
// The minimum and maximum indices that correspond to element zero for both
15610
// the LHS and RHS are computed and will control which shuffle mask entries
15611
// are to be changed. For example, if the RHS is permuted, any shuffle mask
15612
// entries in the range [RHSMinIdx,RHSMaxIdx) will be adjusted.
15613
fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
15614
HalfVec, ValidLaneWidth, Subtarget);
15615
Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
15616
15617
// We may have simplified away the shuffle. We won't be able to do anything
15618
// further with it here.
15619
if (!isa<ShuffleVectorSDNode>(Res))
15620
return Res;
15621
Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
15622
}
15623
15624
SDValue TheSplat = IsLittleEndian ? RHS : LHS;
15625
// The common case after we commuted the shuffle is that the RHS is a splat
15626
// and we have elements coming in from the splat at indices that are not
15627
// conducive to using a merge.
15628
// Example:
15629
// vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
15630
if (!isSplatBV(TheSplat))
15631
return Res;
15632
15633
// We are looking for a mask such that all even elements are from
15634
// one vector and all odd elements from the other.
15635
if (!isAlternatingShuffMask(Mask, NumElts))
15636
return Res;
15637
15638
// Adjust the mask so we are pulling in the same index from the splat
15639
// as the index from the interesting vector in consecutive elements.
15640
if (IsLittleEndian) {
15641
// Example (even elements from first vector):
15642
// vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
15643
if (Mask[0] < NumElts)
15644
for (int i = 1, e = Mask.size(); i < e; i += 2) {
15645
if (ShuffV[i] < 0)
15646
continue;
15647
// If element from non-splat is undef, pick first element from splat.
15648
ShuffV[i] = (ShuffV[i - 1] >= 0 ? ShuffV[i - 1] : 0) + NumElts;
15649
}
15650
// Example (odd elements from first vector):
15651
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
15652
else
15653
for (int i = 0, e = Mask.size(); i < e; i += 2) {
15654
if (ShuffV[i] < 0)
15655
continue;
15656
// If element from non-splat is undef, pick first element from splat.
15657
ShuffV[i] = (ShuffV[i + 1] >= 0 ? ShuffV[i + 1] : 0) + NumElts;
15658
}
15659
} else {
15660
// Example (even elements from first vector):
15661
// vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
15662
if (Mask[0] < NumElts)
15663
for (int i = 0, e = Mask.size(); i < e; i += 2) {
15664
if (ShuffV[i] < 0)
15665
continue;
15666
// If element from non-splat is undef, pick first element from splat.
15667
ShuffV[i] = ShuffV[i + 1] >= 0 ? ShuffV[i + 1] - NumElts : 0;
15668
}
15669
// Example (odd elements from first vector):
15670
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
15671
else
15672
for (int i = 1, e = Mask.size(); i < e; i += 2) {
15673
if (ShuffV[i] < 0)
15674
continue;
15675
// If element from non-splat is undef, pick first element from splat.
15676
ShuffV[i] = ShuffV[i - 1] >= 0 ? ShuffV[i - 1] - NumElts : 0;
15677
}
15678
}
15679
15680
// If the RHS has undefs, we need to remove them since we may have created
15681
// a shuffle that adds those instead of the splat value.
15682
SDValue SplatVal =
15683
cast<BuildVectorSDNode>(TheSplat.getNode())->getSplatValue();
15684
TheSplat = DAG.getSplatBuildVector(TheSplat.getValueType(), dl, SplatVal);
15685
15686
if (IsLittleEndian)
15687
RHS = TheSplat;
15688
else
15689
LHS = TheSplat;
15690
return DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
15691
}
15692
15693
/// Try to fold an element-reversing vector shuffle into the adjacent memory
/// operation.
///
/// \p LSBase must be a normal load or a normal store (asserted). When \p SVN
/// reverses the elements of its first operand, the result is:
///   * for a load:  a PPCISD::LOAD_VEC_BE node built from the load's chain,
///                  base pointer, memory VT and memory operand;
///   * for a store: a PPCISD::STORE_VEC_BE node that stores the shuffle's
///                  first operand directly.
///
/// An empty SDValue is returned when the combine does not apply: the type is
/// not legal, the subtarget is not little-endian with VSX and P9 vector
/// support, the mask is not an exact reversal, or the use-count checks below
/// fail.
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
                                                LSBaseSDNode *LSBase,
                                                DAGCombinerInfo &DCI) const {
  assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
         "Not a reverse memop pattern!");

  // A mask is an element reversal when entry Idx selects lane
  // (NumElts - 1 - Idx), i.e. the mask read back-to-front counts 0, 1, 2, ...
  // Any other entry (including an undef, which is negative) disqualifies it.
  auto IsElementReverse = [](const ShuffleVectorSDNode *Shuffle) -> bool {
    ArrayRef<int> Mask = Shuffle->getMask();
    const int NumElts = static_cast<int>(Mask.size());
    for (int Idx = 0; Idx != NumElts; ++Idx)
      if (Mask[Idx] != NumElts - 1 - Idx)
        return false;
    return true;
  };

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = SVN->getValueType(0);

  // The combine needs a legal type on a little-endian VSX subtarget with P9
  // vector support, and an exact reversal mask.
  //
  // Before P9, the PPCVSXSwapRemoval pass hacks the element order (see the
  // comment in PPCVSXSwapRemoval.cpp); this combine conflicts with that
  // optimization, so it is not performed there.
  const bool CanCombine = isTypeLegal(VT) && Subtarget.isLittleEndian() &&
                          Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
                          IsElementReverse(SVN);
  if (!CanCombine)
    return SDValue();

  SDLoc Loc(LSBase);

  switch (LSBase->getOpcode()) {
  case ISD::LOAD: {
    // If the loaded value (result 0) has any user other than a shufflevector,
    // replacing the shufflevector with a reverse load is not profitable.
    for (SDNode::use_iterator UseIt = LSBase->use_begin(),
                              UseEnd = LSBase->use_end();
         UseIt != UseEnd; ++UseIt) {
      if (UseIt.getUse().getResNo() != 0)
        continue;
      if (UseIt->getOpcode() != ISD::VECTOR_SHUFFLE)
        return SDValue();
    }

    SDValue ReverseLoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        PPCISD::LOAD_VEC_BE, Loc, DAG.getVTList(VT, MVT::Other),
        ReverseLoadOps, LSBase->getMemoryVT(), LSBase->getMemOperand());
  }

  case ISD::STORE: {
    // If there are other uses of the shuffle, the swap cannot be avoided.
    // Forcing the use of an X-Form (since swapped stores only have X-Forms)
    // without removing the swap is unprofitable.
    if (!SVN->hasOneUse())
      return SDValue();

    SDValue ReverseStoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
                                 LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        PPCISD::STORE_VEC_BE, Loc, DAG.getVTList(MVT::Other), ReverseStoreOps,
        LSBase->getMemoryVT(), LSBase->getMemOperand());
  }

  default:
    break;
  }

  llvm_unreachable("Expected a load or store node here");
}
15761
15762
/// Classify \p Intrin as one of the PowerPC store-conditional intrinsics.
///
/// The intrinsic ID is read from constant operand 1 of \p Intrin. On a match
/// \p StoreWidth is set to the access width (8 for ppc_stdcx, 4 for
/// ppc_stwcx, 2 for ppc_sthcx, 1 for ppc_stbcx -- presumably in bytes) and
/// true is returned. For any other intrinsic the function returns false and
/// leaves \p StoreWidth untouched.
static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
  const unsigned ID = Intrin.getConstantOperandVal(1);
  switch (ID) {
  case Intrinsic::ppc_stdcx:
    StoreWidth = 8;
    return true;
  case Intrinsic::ppc_stwcx:
    StoreWidth = 4;
    return true;
  case Intrinsic::ppc_sthcx:
    StoreWidth = 2;
    return true;
  case Intrinsic::ppc_stbcx:
    StoreWidth = 1;
    return true;
  default:
    return false;
  }
}
15776
15777
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
15778
DAGCombinerInfo &DCI) const {
15779
SelectionDAG &DAG = DCI.DAG;
15780
SDLoc dl(N);
15781
switch (N->getOpcode()) {
15782
default: break;
15783
case ISD::ADD:
15784
return combineADD(N, DCI);
15785
case ISD::AND: {
15786
// We don't want (and (zext (shift...)), C) if C fits in the width of the
15787
// original input as that will prevent us from selecting optimal rotates.
15788
// This only matters if the input to the extend is i32 widened to i64.
15789
SDValue Op1 = N->getOperand(0);
15790
SDValue Op2 = N->getOperand(1);
15791
if ((Op1.getOpcode() != ISD::ZERO_EXTEND &&
15792
Op1.getOpcode() != ISD::ANY_EXTEND) ||
15793
!isa<ConstantSDNode>(Op2) || N->getValueType(0) != MVT::i64 ||
15794
Op1.getOperand(0).getValueType() != MVT::i32)
15795
break;
15796
SDValue NarrowOp = Op1.getOperand(0);
15797
if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL &&
15798
NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)
15799
break;
15800
15801
uint64_t Imm = Op2->getAsZExtVal();
15802
// Make sure that the constant is narrow enough to fit in the narrow type.
15803
if (!isUInt<32>(Imm))
15804
break;
15805
SDValue ConstOp = DAG.getConstant(Imm, dl, MVT::i32);
15806
SDValue NarrowAnd = DAG.getNode(ISD::AND, dl, MVT::i32, NarrowOp, ConstOp);
15807
return DAG.getZExtOrTrunc(NarrowAnd, dl, N->getValueType(0));
15808
}
15809
case ISD::SHL:
15810
return combineSHL(N, DCI);
15811
case ISD::SRA:
15812
return combineSRA(N, DCI);
15813
case ISD::SRL:
15814
return combineSRL(N, DCI);
15815
case ISD::MUL:
15816
return combineMUL(N, DCI);
15817
case ISD::FMA:
15818
case PPCISD::FNMSUB:
15819
return combineFMALike(N, DCI);
15820
case PPCISD::SHL:
15821
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
15822
return N->getOperand(0);
15823
break;
15824
case PPCISD::SRL:
15825
if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
15826
return N->getOperand(0);
15827
break;
15828
case PPCISD::SRA:
15829
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
15830
if (C->isZero() || // 0 >>s V -> 0.
15831
C->isAllOnes()) // -1 >>s V -> -1.
15832
return N->getOperand(0);
15833
}
15834
break;
15835
case ISD::SIGN_EXTEND:
15836
case ISD::ZERO_EXTEND:
15837
case ISD::ANY_EXTEND:
15838
return DAGCombineExtBoolTrunc(N, DCI);
15839
case ISD::TRUNCATE:
15840
return combineTRUNCATE(N, DCI);
15841
case ISD::SETCC:
15842
if (SDValue CSCC = combineSetCC(N, DCI))
15843
return CSCC;
15844
[[fallthrough]];
15845
case ISD::SELECT_CC:
15846
return DAGCombineTruncBoolExt(N, DCI);
15847
case ISD::SINT_TO_FP:
15848
case ISD::UINT_TO_FP:
15849
return combineFPToIntToFP(N, DCI);
15850
case ISD::VECTOR_SHUFFLE:
15851
if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
15852
LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
15853
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
15854
}
15855
return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);
15856
case ISD::STORE: {
15857
15858
EVT Op1VT = N->getOperand(1).getValueType();
15859
unsigned Opcode = N->getOperand(1).getOpcode();
15860
15861
if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
15862
Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) {
15863
SDValue Val = combineStoreFPToInt(N, DCI);
15864
if (Val)
15865
return Val;
15866
}
15867
15868
if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
15869
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
15870
SDValue Val= combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
15871
if (Val)
15872
return Val;
15873
}
15874
15875
// Turn STORE (BSWAP) -> sthbrx/stwbrx.
15876
if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
15877
N->getOperand(1).getNode()->hasOneUse() &&
15878
(Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
15879
(Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
15880
15881
// STBRX can only handle simple types and it makes no sense to store less
15882
// two bytes in byte-reversed order.
15883
EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
15884
if (mVT.isExtended() || mVT.getSizeInBits() < 16)
15885
break;
15886
15887
SDValue BSwapOp = N->getOperand(1).getOperand(0);
15888
// Do an any-extend to 32-bits if this is a half-word input.
15889
if (BSwapOp.getValueType() == MVT::i16)
15890
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
15891
15892
// If the type of BSWAP operand is wider than stored memory width
15893
// it need to be shifted to the right side before STBRX.
15894
if (Op1VT.bitsGT(mVT)) {
15895
int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
15896
BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
15897
DAG.getConstant(Shift, dl, MVT::i32));
15898
// Need to truncate if this is a bswap of i64 stored as i32/i16.
15899
if (Op1VT == MVT::i64)
15900
BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
15901
}
15902
15903
SDValue Ops[] = {
15904
N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
15905
};
15906
return
15907
DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
15908
Ops, cast<StoreSDNode>(N)->getMemoryVT(),
15909
cast<StoreSDNode>(N)->getMemOperand());
15910
}
15911
15912
// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
15913
// So it can increase the chance of CSE constant construction.
15914
if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
15915
isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
15916
// Need to sign-extended to 64-bits to handle negative values.
15917
EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
15918
uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
15919
MemVT.getSizeInBits());
15920
SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);
15921
15922
// DAG.getTruncStore() can't be used here because it doesn't accept
15923
// the general (base + offset) addressing mode.
15924
// So we use UpdateNodeOperands and setTruncatingStore instead.
15925
DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
15926
N->getOperand(3));
15927
cast<StoreSDNode>(N)->setTruncatingStore(true);
15928
return SDValue(N, 0);
15929
}
15930
15931
// For little endian, VSX stores require generating xxswapd/lxvd2x.
15932
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
15933
if (Op1VT.isSimple()) {
15934
MVT StoreVT = Op1VT.getSimpleVT();
15935
if (Subtarget.needsSwapsForVSXMemOps() &&
15936
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
15937
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
15938
return expandVSXStoreForLE(N, DCI);
15939
}
15940
break;
15941
}
15942
case ISD::LOAD: {
15943
LoadSDNode *LD = cast<LoadSDNode>(N);
15944
EVT VT = LD->getValueType(0);
15945
15946
// For little endian, VSX loads require generating lxvd2x/xxswapd.
15947
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
15948
if (VT.isSimple()) {
15949
MVT LoadVT = VT.getSimpleVT();
15950
if (Subtarget.needsSwapsForVSXMemOps() &&
15951
(LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
15952
LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
15953
return expandVSXLoadForLE(N, DCI);
15954
}
15955
15956
// We sometimes end up with a 64-bit integer load, from which we extract
15957
// two single-precision floating-point numbers. This happens with
15958
// std::complex<float>, and other similar structures, because of the way we
15959
// canonicalize structure copies. However, if we lack direct moves,
15960
// then the final bitcasts from the extracted integer values to the
15961
// floating-point numbers turn into store/load pairs. Even with direct moves,
15962
// just loading the two floating-point numbers is likely better.
15963
auto ReplaceTwoFloatLoad = [&]() {
15964
if (VT != MVT::i64)
15965
return false;
15966
15967
if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
15968
LD->isVolatile())
15969
return false;
15970
15971
// We're looking for a sequence like this:
15972
// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
15973
// t16: i64 = srl t13, Constant:i32<32>
15974
// t17: i32 = truncate t16
15975
// t18: f32 = bitcast t17
15976
// t19: i32 = truncate t13
15977
// t20: f32 = bitcast t19
15978
15979
if (!LD->hasNUsesOfValue(2, 0))
15980
return false;
15981
15982
auto UI = LD->use_begin();
15983
while (UI.getUse().getResNo() != 0) ++UI;
15984
SDNode *Trunc = *UI++;
15985
while (UI.getUse().getResNo() != 0) ++UI;
15986
SDNode *RightShift = *UI;
15987
if (Trunc->getOpcode() != ISD::TRUNCATE)
15988
std::swap(Trunc, RightShift);
15989
15990
if (Trunc->getOpcode() != ISD::TRUNCATE ||
15991
Trunc->getValueType(0) != MVT::i32 ||
15992
!Trunc->hasOneUse())
15993
return false;
15994
if (RightShift->getOpcode() != ISD::SRL ||
15995
!isa<ConstantSDNode>(RightShift->getOperand(1)) ||
15996
RightShift->getConstantOperandVal(1) != 32 ||
15997
!RightShift->hasOneUse())
15998
return false;
15999
16000
SDNode *Trunc2 = *RightShift->use_begin();
16001
if (Trunc2->getOpcode() != ISD::TRUNCATE ||
16002
Trunc2->getValueType(0) != MVT::i32 ||
16003
!Trunc2->hasOneUse())
16004
return false;
16005
16006
SDNode *Bitcast = *Trunc->use_begin();
16007
SDNode *Bitcast2 = *Trunc2->use_begin();
16008
16009
if (Bitcast->getOpcode() != ISD::BITCAST ||
16010
Bitcast->getValueType(0) != MVT::f32)
16011
return false;
16012
if (Bitcast2->getOpcode() != ISD::BITCAST ||
16013
Bitcast2->getValueType(0) != MVT::f32)
16014
return false;
16015
16016
if (Subtarget.isLittleEndian())
16017
std::swap(Bitcast, Bitcast2);
16018
16019
// Bitcast has the second float (in memory-layout order) and Bitcast2
16020
// has the first one.
16021
16022
SDValue BasePtr = LD->getBasePtr();
16023
if (LD->isIndexed()) {
16024
assert(LD->getAddressingMode() == ISD::PRE_INC &&
16025
"Non-pre-inc AM on PPC?");
16026
BasePtr =
16027
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16028
LD->getOffset());
16029
}
16030
16031
auto MMOFlags =
16032
LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
16033
SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
16034
LD->getPointerInfo(), LD->getAlign(),
16035
MMOFlags, LD->getAAInfo());
16036
SDValue AddPtr =
16037
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
16038
BasePtr, DAG.getIntPtrConstant(4, dl));
16039
SDValue FloatLoad2 = DAG.getLoad(
16040
MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
16041
LD->getPointerInfo().getWithOffset(4),
16042
commonAlignment(LD->getAlign(), 4), MMOFlags, LD->getAAInfo());
16043
16044
if (LD->isIndexed()) {
16045
// Note that DAGCombine should re-form any pre-increment load(s) from
16046
// what is produced here if that makes sense.
16047
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
16048
}
16049
16050
DCI.CombineTo(Bitcast2, FloatLoad);
16051
DCI.CombineTo(Bitcast, FloatLoad2);
16052
16053
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
16054
SDValue(FloatLoad2.getNode(), 1));
16055
return true;
16056
};
16057
16058
if (ReplaceTwoFloatLoad())
16059
return SDValue(N, 0);
16060
16061
EVT MemVT = LD->getMemoryVT();
16062
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
16063
Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
16064
if (LD->isUnindexed() && VT.isVector() &&
16065
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
16066
// P8 and later hardware should just use LOAD.
16067
!Subtarget.hasP8Vector() &&
16068
(VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16069
VT == MVT::v4f32))) &&
16070
LD->getAlign() < ABIAlignment) {
16071
// This is a type-legal unaligned Altivec load.
16072
SDValue Chain = LD->getChain();
16073
SDValue Ptr = LD->getBasePtr();
16074
bool isLittleEndian = Subtarget.isLittleEndian();
16075
16076
// This implements the loading of unaligned vectors as described in
16077
// the venerable Apple Velocity Engine overview. Specifically:
16078
// https://developer.apple.com/hardwaredrivers/ve/alignment.html
16079
// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
16080
//
16081
// The general idea is to expand a sequence of one or more unaligned
16082
// loads into an alignment-based permutation-control instruction (lvsl
16083
// or lvsr), a series of regular vector loads (which always truncate
16084
// their input address to an aligned address), and a series of
16085
// permutations. The results of these permutations are the requested
16086
// loaded values. The trick is that the last "extra" load is not taken
16087
// from the address you might suspect (sizeof(vector) bytes after the
16088
// last requested load), but rather sizeof(vector) - 1 bytes after the
16089
// last requested vector. The point of this is to avoid a page fault if
16090
// the base address happened to be aligned. This works because if the
16091
// base address is aligned, then adding less than a full vector length
16092
// will cause the last vector in the sequence to be (re)loaded.
16093
// Otherwise, the next vector will be fetched as you might suspect was
16094
// necessary.
16095
16096
// We might be able to reuse the permutation generation from
16097
// a different base address offset from this one by an aligned amount.
16098
// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
16099
// optimization later.
16100
Intrinsic::ID Intr, IntrLD, IntrPerm;
16101
MVT PermCntlTy, PermTy, LDTy;
16102
Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
16103
: Intrinsic::ppc_altivec_lvsl;
16104
IntrLD = Intrinsic::ppc_altivec_lvx;
16105
IntrPerm = Intrinsic::ppc_altivec_vperm;
16106
PermCntlTy = MVT::v16i8;
16107
PermTy = MVT::v4i32;
16108
LDTy = MVT::v4i32;
16109
16110
SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);
16111
16112
// Create the new MMO for the new base load. It is like the original MMO,
16113
// but represents an area in memory almost twice the vector size centered
16114
// on the original address. If the address is unaligned, we might start
16115
// reading up to (sizeof(vector)-1) bytes below the address of the
16116
// original unaligned load.
16117
MachineFunction &MF = DAG.getMachineFunction();
16118
MachineMemOperand *BaseMMO =
16119
MF.getMachineMemOperand(LD->getMemOperand(),
16120
-(int64_t)MemVT.getStoreSize()+1,
16121
2*MemVT.getStoreSize()-1);
16122
16123
// Create the new base load.
16124
SDValue LDXIntID =
16125
DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
16126
SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
16127
SDValue BaseLoad =
16128
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
16129
DAG.getVTList(PermTy, MVT::Other),
16130
BaseLoadOps, LDTy, BaseMMO);
16131
16132
// Note that the value of IncOffset (which is provided to the next
16133
// load's pointer info offset value, and thus used to calculate the
16134
// alignment), and the value of IncValue (which is actually used to
16135
// increment the pointer value) are different! This is because we
16136
// require the next load to appear to be aligned, even though it
16137
// is actually offset from the base pointer by a lesser amount.
16138
int IncOffset = VT.getSizeInBits() / 8;
16139
int IncValue = IncOffset;
16140
16141
// Walk (both up and down) the chain looking for another load at the real
16142
// (aligned) offset (the alignment of the other load does not matter in
16143
// this case). If found, then do not use the offset reduction trick, as
16144
// that will prevent the loads from being later combined (as they would
16145
// otherwise be duplicates).
16146
if (!findConsecutiveLoad(LD, DAG))
16147
--IncValue;
16148
16149
SDValue Increment =
16150
DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
16151
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16152
16153
MachineMemOperand *ExtraMMO =
16154
MF.getMachineMemOperand(LD->getMemOperand(),
16155
1, 2*MemVT.getStoreSize()-1);
16156
SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
16157
SDValue ExtraLoad =
16158
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
16159
DAG.getVTList(PermTy, MVT::Other),
16160
ExtraLoadOps, LDTy, ExtraMMO);
16161
16162
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16163
BaseLoad.getValue(1), ExtraLoad.getValue(1));
16164
16165
// Because vperm has a big-endian bias, we must reverse the order
16166
// of the input vectors and complement the permute control vector
16167
// when generating little endian code. We have already handled the
16168
// latter by using lvsr instead of lvsl, so just reverse BaseLoad
16169
// and ExtraLoad here.
16170
SDValue Perm;
16171
if (isLittleEndian)
16172
Perm = BuildIntrinsicOp(IntrPerm,
16173
ExtraLoad, BaseLoad, PermCntl, DAG, dl);
16174
else
16175
Perm = BuildIntrinsicOp(IntrPerm,
16176
BaseLoad, ExtraLoad, PermCntl, DAG, dl);
16177
16178
if (VT != PermTy)
16179
Perm = Subtarget.hasAltivec()
16180
? DAG.getNode(ISD::BITCAST, dl, VT, Perm)
16181
: DAG.getNode(ISD::FP_ROUND, dl, VT, Perm,
16182
DAG.getTargetConstant(1, dl, MVT::i64));
16183
// second argument is 1 because this rounding
16184
// is always exact.
16185
16186
// The output of the permutation is our loaded result, the TokenFactor is
16187
// our new chain.
16188
DCI.CombineTo(N, Perm, TF);
16189
return SDValue(N, 0);
16190
}
16191
}
16192
break;
16193
case ISD::INTRINSIC_WO_CHAIN: {
16194
bool isLittleEndian = Subtarget.isLittleEndian();
16195
unsigned IID = N->getConstantOperandVal(0);
16196
Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
16197
: Intrinsic::ppc_altivec_lvsl);
16198
if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {
16199
SDValue Add = N->getOperand(1);
16200
16201
int Bits = 4 /* 16 byte alignment */;
16202
16203
if (DAG.MaskedValueIsZero(Add->getOperand(1),
16204
APInt::getAllOnes(Bits /* alignment */)
16205
.zext(Add.getScalarValueSizeInBits()))) {
16206
SDNode *BasePtr = Add->getOperand(0).getNode();
16207
for (SDNode *U : BasePtr->uses()) {
16208
if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
16209
U->getConstantOperandVal(0) == IID) {
16210
// We've found another LVSL/LVSR, and this address is an aligned
16211
// multiple of that one. The results will be the same, so use the
16212
// one we've just found instead.
16213
16214
return SDValue(U, 0);
16215
}
16216
}
16217
}
16218
16219
if (isa<ConstantSDNode>(Add->getOperand(1))) {
16220
SDNode *BasePtr = Add->getOperand(0).getNode();
16221
for (SDNode *U : BasePtr->uses()) {
16222
if (U->getOpcode() == ISD::ADD &&
16223
isa<ConstantSDNode>(U->getOperand(1)) &&
16224
(Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) %
16225
(1ULL << Bits) ==
16226
0) {
16227
SDNode *OtherAdd = U;
16228
for (SDNode *V : OtherAdd->uses()) {
16229
if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
16230
V->getConstantOperandVal(0) == IID) {
16231
return SDValue(V, 0);
16232
}
16233
}
16234
}
16235
}
16236
}
16237
}
16238
16239
// Combine vmaxsw/h/b(a, a's negation) to abs(a)
16240
// Expose the vabsduw/h/b opportunity for down stream
16241
if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
16242
(IID == Intrinsic::ppc_altivec_vmaxsw ||
16243
IID == Intrinsic::ppc_altivec_vmaxsh ||
16244
IID == Intrinsic::ppc_altivec_vmaxsb)) {
16245
SDValue V1 = N->getOperand(1);
16246
SDValue V2 = N->getOperand(2);
16247
if ((V1.getSimpleValueType() == MVT::v4i32 ||
16248
V1.getSimpleValueType() == MVT::v8i16 ||
16249
V1.getSimpleValueType() == MVT::v16i8) &&
16250
V1.getSimpleValueType() == V2.getSimpleValueType()) {
16251
// (0-a, a)
16252
if (V1.getOpcode() == ISD::SUB &&
16253
ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
16254
V1.getOperand(1) == V2) {
16255
return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
16256
}
16257
// (a, 0-a)
16258
if (V2.getOpcode() == ISD::SUB &&
16259
ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
16260
V2.getOperand(1) == V1) {
16261
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
16262
}
16263
// (x-y, y-x)
16264
if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
16265
V1.getOperand(0) == V2.getOperand(1) &&
16266
V1.getOperand(1) == V2.getOperand(0)) {
16267
return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
16268
}
16269
}
16270
}
16271
}
16272
16273
break;
16274
case ISD::INTRINSIC_W_CHAIN:
16275
switch (N->getConstantOperandVal(1)) {
16276
default:
16277
break;
16278
case Intrinsic::ppc_altivec_vsum4sbs:
16279
case Intrinsic::ppc_altivec_vsum4shs:
16280
case Intrinsic::ppc_altivec_vsum4ubs: {
16281
// These sum-across intrinsics only have a chain due to the side effect
16282
// that they may set the SAT bit. If we know the SAT bit will not be set
16283
// for some inputs, we can replace any uses of their chain with the
16284
// input chain.
16285
if (BuildVectorSDNode *BVN =
16286
dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {
16287
APInt APSplatBits, APSplatUndef;
16288
unsigned SplatBitSize;
16289
bool HasAnyUndefs;
16290
bool BVNIsConstantSplat = BVN->isConstantSplat(
16291
APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,
16292
!Subtarget.isLittleEndian());
16293
// If the constant splat vector is 0, the SAT bit will not be set.
16294
if (BVNIsConstantSplat && APSplatBits == 0)
16295
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));
16296
}
16297
return SDValue();
16298
}
16299
case Intrinsic::ppc_vsx_lxvw4x:
16300
case Intrinsic::ppc_vsx_lxvd2x:
16301
// For little endian, VSX loads require generating lxvd2x/xxswapd.
16302
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
16303
if (Subtarget.needsSwapsForVSXMemOps())
16304
return expandVSXLoadForLE(N, DCI);
16305
break;
16306
}
16307
break;
16308
case ISD::INTRINSIC_VOID:
16309
// For little endian, VSX stores require generating xxswapd/stxvd2x.
16310
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
16311
if (Subtarget.needsSwapsForVSXMemOps()) {
16312
switch (N->getConstantOperandVal(1)) {
16313
default:
16314
break;
16315
case Intrinsic::ppc_vsx_stxvw4x:
16316
case Intrinsic::ppc_vsx_stxvd2x:
16317
return expandVSXStoreForLE(N, DCI);
16318
}
16319
}
16320
break;
16321
case ISD::BSWAP: {
16322
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
16323
// For subtargets without LDBRX, we can still do better than the default
16324
// expansion even for 64-bit BSWAP (LOAD).
16325
bool Is64BitBswapOn64BitTgt =
16326
Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;
16327
bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&
16328
N->getOperand(0).hasOneUse();
16329
if (IsSingleUseNormalLd &&
16330
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
16331
(Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
16332
SDValue Load = N->getOperand(0);
16333
LoadSDNode *LD = cast<LoadSDNode>(Load);
16334
// Create the byte-swapping load.
16335
SDValue Ops[] = {
16336
LD->getChain(), // Chain
16337
LD->getBasePtr(), // Ptr
16338
DAG.getValueType(N->getValueType(0)) // VT
16339
};
16340
SDValue BSLoad =
16341
DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
16342
DAG.getVTList(N->getValueType(0) == MVT::i64 ?
16343
MVT::i64 : MVT::i32, MVT::Other),
16344
Ops, LD->getMemoryVT(), LD->getMemOperand());
16345
16346
// If this is an i16 load, insert the truncate.
16347
SDValue ResVal = BSLoad;
16348
if (N->getValueType(0) == MVT::i16)
16349
ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
16350
16351
// First, combine the bswap away. This makes the value produced by the
16352
// load dead.
16353
DCI.CombineTo(N, ResVal);
16354
16355
// Next, combine the load away, we give it a bogus result value but a real
16356
// chain result. The result value is dead because the bswap is dead.
16357
DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
16358
16359
// Return N so it doesn't get rechecked!
16360
return SDValue(N, 0);
16361
}
16362
// Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
16363
// before legalization so that the BUILD_PAIR is handled correctly.
16364
if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
16365
!IsSingleUseNormalLd)
16366
return SDValue();
16367
LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));
16368
16369
// Can't split volatile or atomic loads.
16370
if (!LD->isSimple())
16371
return SDValue();
16372
SDValue BasePtr = LD->getBasePtr();
16373
SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
16374
LD->getPointerInfo(), LD->getAlign());
16375
Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);
16376
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16377
DAG.getIntPtrConstant(4, dl));
16378
MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
16379
LD->getMemOperand(), 4, 4);
16380
SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr, NewMMO);
16381
Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);
16382
SDValue Res;
16383
if (Subtarget.isLittleEndian())
16384
Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);
16385
else
16386
Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
16387
SDValue TF =
16388
DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
16389
Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));
16390
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);
16391
return Res;
16392
}
16393
case PPCISD::VCMP:
16394
// If a VCMP_rec node already exists with exactly the same operands as this
16395
// node, use its result instead of this node (VCMP_rec computes both a CR6
16396
// and a normal output).
16397
//
16398
if (!N->getOperand(0).hasOneUse() &&
16399
!N->getOperand(1).hasOneUse() &&
16400
!N->getOperand(2).hasOneUse()) {
16401
16402
// Scan all of the users of the LHS, looking for VCMP_rec's that match.
16403
SDNode *VCMPrecNode = nullptr;
16404
16405
SDNode *LHSN = N->getOperand(0).getNode();
16406
for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
16407
UI != E; ++UI)
16408
if (UI->getOpcode() == PPCISD::VCMP_rec &&
16409
UI->getOperand(1) == N->getOperand(1) &&
16410
UI->getOperand(2) == N->getOperand(2) &&
16411
UI->getOperand(0) == N->getOperand(0)) {
16412
VCMPrecNode = *UI;
16413
break;
16414
}
16415
16416
// If there is no VCMP_rec node, or if the flag value has a single use,
16417
// don't transform this.
16418
if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(0, 1))
16419
break;
16420
16421
// Look at the (necessarily single) use of the flag value. If it has a
16422
// chain, this transformation is more complex. Note that multiple things
16423
// could use the value result, which we should ignore.
16424
SDNode *FlagUser = nullptr;
16425
for (SDNode::use_iterator UI = VCMPrecNode->use_begin();
16426
FlagUser == nullptr; ++UI) {
16427
assert(UI != VCMPrecNode->use_end() && "Didn't find user!");
16428
SDNode *User = *UI;
16429
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
16430
if (User->getOperand(i) == SDValue(VCMPrecNode, 1)) {
16431
FlagUser = User;
16432
break;
16433
}
16434
}
16435
}
16436
16437
// If the user is a MFOCRF instruction, we know this is safe.
16438
// Otherwise we give up for right now.
16439
if (FlagUser->getOpcode() == PPCISD::MFOCRF)
16440
return SDValue(VCMPrecNode, 0);
16441
}
16442
break;
16443
case ISD::BR_CC: {
16444
// If this is a branch on an altivec predicate comparison, lower this so
16445
// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
16446
// lowering is done pre-legalize, because the legalizer lowers the predicate
16447
// compare down to code that is difficult to reassemble.
16448
// This code also handles branches that depend on the result of a store
16449
// conditional.
16450
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
16451
SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
16452
16453
int CompareOpc;
16454
bool isDot;
16455
16456
if (!isa<ConstantSDNode>(RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE))
16457
break;
16458
16459
// Since we are doing this pre-legalize, the RHS can be a constant of
16460
// arbitrary bitwidth which may cause issues when trying to get the value
16461
// from the underlying APInt.
16462
auto RHSAPInt = RHS->getAsAPIntVal();
16463
if (!RHSAPInt.isIntN(64))
16464
break;
16465
16466
unsigned Val = RHSAPInt.getZExtValue();
16467
auto isImpossibleCompare = [&]() {
16468
// If this is a comparison against something other than 0/1, then we know
16469
// that the condition is never/always true.
16470
if (Val != 0 && Val != 1) {
16471
if (CC == ISD::SETEQ) // Cond never true, remove branch.
16472
return N->getOperand(0);
16473
// Always !=, turn it into an unconditional branch.
16474
return DAG.getNode(ISD::BR, dl, MVT::Other,
16475
N->getOperand(0), N->getOperand(4));
16476
}
16477
return SDValue();
16478
};
16479
// Combine branches fed by store conditional instructions (st[bhwd]cx).
16480
unsigned StoreWidth = 0;
16481
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
16482
isStoreConditional(LHS, StoreWidth)) {
16483
if (SDValue Impossible = isImpossibleCompare())
16484
return Impossible;
16485
PPC::Predicate CompOpc;
16486
// eq 0 => ne
16487
// ne 0 => eq
16488
// eq 1 => eq
16489
// ne 1 => ne
16490
if (Val == 0)
16491
CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ;
16492
else
16493
CompOpc = CC == ISD::SETEQ ? PPC::PRED_EQ : PPC::PRED_NE;
16494
16495
SDValue Ops[] = {LHS.getOperand(0), LHS.getOperand(2), LHS.getOperand(3),
16496
DAG.getConstant(StoreWidth, dl, MVT::i32)};
16497
auto *MemNode = cast<MemSDNode>(LHS);
16498
SDValue ConstSt = DAG.getMemIntrinsicNode(
16499
PPCISD::STORE_COND, dl,
16500
DAG.getVTList(MVT::i32, MVT::Other, MVT::Glue), Ops,
16501
MemNode->getMemoryVT(), MemNode->getMemOperand());
16502
16503
SDValue InChain;
16504
// Unchain the branch from the original store conditional.
16505
if (N->getOperand(0) == LHS.getValue(1))
16506
InChain = LHS.getOperand(0);
16507
else if (N->getOperand(0).getOpcode() == ISD::TokenFactor) {
16508
SmallVector<SDValue, 4> InChains;
16509
SDValue InTF = N->getOperand(0);
16510
for (int i = 0, e = InTF.getNumOperands(); i < e; i++)
16511
if (InTF.getOperand(i) != LHS.getValue(1))
16512
InChains.push_back(InTF.getOperand(i));
16513
InChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, InChains);
16514
}
16515
16516
return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, InChain,
16517
DAG.getConstant(CompOpc, dl, MVT::i32),
16518
DAG.getRegister(PPC::CR0, MVT::i32), N->getOperand(4),
16519
ConstSt.getValue(2));
16520
}
16521
16522
if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
16523
getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
16524
assert(isDot && "Can't compare against a vector result!");
16525
16526
if (SDValue Impossible = isImpossibleCompare())
16527
return Impossible;
16528
16529
bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
16530
// Create the PPCISD altivec 'dot' comparison node.
16531
SDValue Ops[] = {
16532
LHS.getOperand(2), // LHS of compare
16533
LHS.getOperand(3), // RHS of compare
16534
DAG.getConstant(CompareOpc, dl, MVT::i32)
16535
};
16536
EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
16537
SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);
16538
16539
// Unpack the result based on how the target uses it.
16540
PPC::Predicate CompOpc;
16541
switch (LHS.getConstantOperandVal(1)) {
16542
default: // Can't happen, don't crash on invalid number though.
16543
case 0: // Branch on the value of the EQ bit of CR6.
16544
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
16545
break;
16546
case 1: // Branch on the inverted value of the EQ bit of CR6.
16547
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
16548
break;
16549
case 2: // Branch on the value of the LT bit of CR6.
16550
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
16551
break;
16552
case 3: // Branch on the inverted value of the LT bit of CR6.
16553
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
16554
break;
16555
}
16556
16557
return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
16558
DAG.getConstant(CompOpc, dl, MVT::i32),
16559
DAG.getRegister(PPC::CR6, MVT::i32),
16560
N->getOperand(4), CompNode.getValue(1));
16561
}
16562
break;
16563
}
16564
case ISD::BUILD_VECTOR:
16565
return DAGCombineBuildVector(N, DCI);
16566
}
16567
16568
return SDValue();
16569
}
16570
16571
/// Fold (sdiv X, pow2) into a PPCISD::SRA_ADDZE node.
///
/// Handles i32 always and i64 only on 64-bit subtargets; the divisor must be
/// a power of two or a negated power of two. For a negated power of two the
/// SRA_ADDZE result is additionally subtracted from zero. Every node built
/// here is appended to \p Created. Returns an empty SDValue when the fold does
/// not apply.
SDValue
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  const EVT ResultVT = N->getValueType(0);
  const bool IsI32 = ResultVT == MVT::i32;
  const bool IsI64 = ResultVT == MVT::i64;

  // 64-bit division is only handled when 64-bit registers are available.
  if (IsI64 && !Subtarget.isPPC64())
    return SDValue();
  if (!IsI32 && !IsI64)
    return SDValue();

  const bool IsNegPow2 = Divisor.isNegatedPowerOf2();
  if (!IsNegPow2 && !Divisor.isPowerOf2())
    return SDValue();

  SDLoc DL(N);
  SDValue Dividend = N->getOperand(0);

  // The shift amount is log2 of the divisor's magnitude.
  const APInt Magnitude = IsNegPow2 ? -Divisor : Divisor;
  const unsigned Lg2 = Magnitude.countr_zero();
  SDValue ShiftAmt = DAG.getConstant(Lg2, DL, ResultVT);

  SDValue Quotient =
      DAG.getNode(PPCISD::SRA_ADDZE, DL, ResultVT, Dividend, ShiftAmt);
  Created.push_back(Quotient.getNode());

  if (!IsNegPow2)
    return Quotient;

  // Dividing by -(2^k) is the negation of dividing by 2^k.
  SDValue Zero = DAG.getConstant(0, DL, ResultVT);
  SDValue Negated = DAG.getNode(ISD::SUB, DL, ResultVT, Zero, Quotient);
  Created.push_back(Negated.getNode());
  return Negated;
}
16600
16601
//===----------------------------------------------------------------------===//
16602
// Inline Assembly Support
16603
//===----------------------------------------------------------------------===//
16604
16605
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16606
KnownBits &Known,
16607
const APInt &DemandedElts,
16608
const SelectionDAG &DAG,
16609
unsigned Depth) const {
16610
Known.resetAll();
16611
switch (Op.getOpcode()) {
16612
default: break;
16613
case PPCISD::LBRX: {
16614
// lhbrx is known to have the top bits cleared out.
16615
if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
16616
Known.Zero = 0xFFFF0000;
16617
break;
16618
}
16619
case ISD::INTRINSIC_WO_CHAIN: {
16620
switch (Op.getConstantOperandVal(0)) {
16621
default: break;
16622
case Intrinsic::ppc_altivec_vcmpbfp_p:
16623
case Intrinsic::ppc_altivec_vcmpeqfp_p:
16624
case Intrinsic::ppc_altivec_vcmpequb_p:
16625
case Intrinsic::ppc_altivec_vcmpequh_p:
16626
case Intrinsic::ppc_altivec_vcmpequw_p:
16627
case Intrinsic::ppc_altivec_vcmpequd_p:
16628
case Intrinsic::ppc_altivec_vcmpequq_p:
16629
case Intrinsic::ppc_altivec_vcmpgefp_p:
16630
case Intrinsic::ppc_altivec_vcmpgtfp_p:
16631
case Intrinsic::ppc_altivec_vcmpgtsb_p:
16632
case Intrinsic::ppc_altivec_vcmpgtsh_p:
16633
case Intrinsic::ppc_altivec_vcmpgtsw_p:
16634
case Intrinsic::ppc_altivec_vcmpgtsd_p:
16635
case Intrinsic::ppc_altivec_vcmpgtsq_p:
16636
case Intrinsic::ppc_altivec_vcmpgtub_p:
16637
case Intrinsic::ppc_altivec_vcmpgtuh_p:
16638
case Intrinsic::ppc_altivec_vcmpgtuw_p:
16639
case Intrinsic::ppc_altivec_vcmpgtud_p:
16640
case Intrinsic::ppc_altivec_vcmpgtuq_p:
16641
Known.Zero = ~1U; // All bits but the low one are known to be zero.
16642
break;
16643
}
16644
break;
16645
}
16646
case ISD::INTRINSIC_W_CHAIN: {
16647
switch (Op.getConstantOperandVal(1)) {
16648
default:
16649
break;
16650
case Intrinsic::ppc_load2r:
16651
// Top bits are cleared for load2r (which is the same as lhbrx).
16652
Known.Zero = 0xFFFF0000;
16653
break;
16654
}
16655
break;
16656
}
16657
}
16658
}
16659
16660
/// Return the preferred alignment for loop \p ML.
///
/// On the CPU directives listed below a 32-byte alignment is requested for
/// nested innermost loops (unless DisableInnermostLoopAlign32 is set) and for
/// small loops whose instructions total more than 16 and at most 32 bytes.
/// Everything else, including a null \p ML, defers to the generic
/// TargetLowering implementation.
Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  switch (Subtarget.getCPUDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
  case PPC::DIR_PWR10:
  case PPC::DIR_PWR11:
  case PPC::DIR_PWR_FUTURE: {
    if (!ML)
      break;

    if (!DisableInnermostLoopAlign32) {
      // If the nested loop is an innermost loop, prefer to a 32-byte alignment,
      // so that we can decrease cache misses and branch-prediction misses.
      // Actual alignment of the loop will depend on the hotness check and other
      // logic in alignBlocks.
      if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
        return Align(32);
    }

    const PPCInstrInfo *TII = Subtarget.getInstrInfo();

    // For small loops (between 5 and 8 instructions), align to a 32-byte
    // boundary so that the entire loop fits in one instruction-cache line.
    //
    // Only the question "is the loop larger than 32 bytes?" matters, so stop
    // walking blocks as soon as the running size exceeds that bound instead of
    // leaving just the inner loop and scanning every remaining block.
    uint64_t LoopSize = 0;
    for (auto I = ML->block_begin(), IE = ML->block_end();
         I != IE && LoopSize <= 32; ++I)
      for (const MachineInstr &J : **I) {
        LoopSize += TII->getInstSizeInBytes(J);
        if (LoopSize > 32)
          break;
      }

    if (LoopSize > 16 && LoopSize <= 32)
      return Align(32);

    break;
  }
  }

  return TargetLowering::getPrefLoopAlignment(ML);
}
16708
16709
/// getConstraintType - Given a constraint, return the type of
16710
/// constraint it is for this target.
16711
PPCTargetLowering::ConstraintType
16712
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
16713
if (Constraint.size() == 1) {
16714
switch (Constraint[0]) {
16715
default: break;
16716
case 'b':
16717
case 'r':
16718
case 'f':
16719
case 'd':
16720
case 'v':
16721
case 'y':
16722
return C_RegisterClass;
16723
case 'Z':
16724
// FIXME: While Z does indicate a memory constraint, it specifically
16725
// indicates an r+r address (used in conjunction with the 'y' modifier
16726
// in the replacement string). Currently, we're forcing the base
16727
// register to be r0 in the asm printer (which is interpreted as zero)
16728
// and forming the complete address in the second register. This is
16729
// suboptimal.
16730
return C_Memory;
16731
}
16732
} else if (Constraint == "wc") { // individual CR bits.
16733
return C_RegisterClass;
16734
} else if (Constraint == "wa" || Constraint == "wd" ||
16735
Constraint == "wf" || Constraint == "ws" ||
16736
Constraint == "wi" || Constraint == "ww") {
16737
return C_RegisterClass; // VSX registers.
16738
}
16739
return TargetLowering::getConstraintType(Constraint);
16740
}
16741
16742
/// Examine constraint type and operand type and determine a weight value.
16743
/// This object must already have been set up with the operand type
16744
/// and the current alternative constraint selected.
16745
TargetLowering::ConstraintWeight
16746
PPCTargetLowering::getSingleConstraintMatchWeight(
16747
AsmOperandInfo &info, const char *constraint) const {
16748
ConstraintWeight weight = CW_Invalid;
16749
Value *CallOperandVal = info.CallOperandVal;
16750
// If we don't have a value, we can't do a match,
16751
// but allow it at the lowest weight.
16752
if (!CallOperandVal)
16753
return CW_Default;
16754
Type *type = CallOperandVal->getType();
16755
16756
// Look at the constraint type.
16757
if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
16758
return CW_Register; // an individual CR bit.
16759
else if ((StringRef(constraint) == "wa" ||
16760
StringRef(constraint) == "wd" ||
16761
StringRef(constraint) == "wf") &&
16762
type->isVectorTy())
16763
return CW_Register;
16764
else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
16765
return CW_Register; // just hold 64-bit integers data.
16766
else if (StringRef(constraint) == "ws" && type->isDoubleTy())
16767
return CW_Register;
16768
else if (StringRef(constraint) == "ww" && type->isFloatTy())
16769
return CW_Register;
16770
16771
switch (*constraint) {
16772
default:
16773
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
16774
break;
16775
case 'b':
16776
if (type->isIntegerTy())
16777
weight = CW_Register;
16778
break;
16779
case 'f':
16780
if (type->isFloatTy())
16781
weight = CW_Register;
16782
break;
16783
case 'd':
16784
if (type->isDoubleTy())
16785
weight = CW_Register;
16786
break;
16787
case 'v':
16788
if (type->isVectorTy())
16789
weight = CW_Register;
16790
break;
16791
case 'y':
16792
weight = CW_Register;
16793
break;
16794
case 'Z':
16795
weight = CW_Memory;
16796
break;
16797
}
16798
return weight;
16799
}
16800
16801
/// Resolve an inline-asm register constraint to a physical register and/or a
/// register class.
///
/// Letter constraints return register 0 together with the selected class.
/// Explicit "{vsN}" and "{fN}" names are resolved here because the generic
/// lookup does not match them correctly; all other names go through
/// TargetLowering::getRegForInlineAsmConstraint, after which 32-bit GPRs are
/// upgraded to their 64-bit super-register for i64 on PPC64 and "{cc}" is
/// accepted as an alias for CR0.
///
/// An out-of-range "{vsN}" or "{fN}" number is a fatal error.
std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b':   // R1-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
    case 'r':   // R0-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RCRegClass);
      return std::make_pair(0U, &PPC::GPRCRegClass);
    // 'd' and 'f' constraints are both defined to be "the floating point
    // registers", where one is for 32-bit and the other for 64-bit. We don't
    // really care overly much here so just give them all the same reg classes.
    case 'd':
    case 'f':
      if (Subtarget.hasSPE()) {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::GPRCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::SPERCRegClass);
      } else {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::F4RCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::F8RCRegClass);
      }
      break;
    case 'v':
      if (Subtarget.hasAltivec() && VT.isVector())
        return std::make_pair(0U, &PPC::VRRCRegClass);
      else if (Subtarget.hasVSX())
        // Scalars in Altivec registers only make sense with VSX.
        return std::make_pair(0U, &PPC::VFRCRegClass);
      break;
    case 'y':   // crrc
      return std::make_pair(0U, &PPC::CRRCRegClass);
    }
  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
    // An individual CR bit.
    return std::make_pair(0U, &PPC::CRBITRCRegClass);
  } else if ((Constraint == "wa" || Constraint == "wd" ||
              Constraint == "wf" || Constraint == "wi") &&
             Subtarget.hasVSX()) {
    // A VSX register for either a scalar (FP) or vector. There is no
    // support for single precision scalars on subtargets prior to Power8.
    if (VT.isVector())
      return std::make_pair(0U, &PPC::VSRCRegClass);
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(0U, &PPC::VSSRCRegClass);
    return std::make_pair(0U, &PPC::VSFRCRegClass);
  } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(0U, &PPC::VSSRCRegClass);
    else
      return std::make_pair(0U, &PPC::VSFRCRegClass);
  } else if (Constraint == "lr") {
    if (VT == MVT::i64)
      return std::make_pair(0U, &PPC::LR8RCRegClass);
    else
      return std::make_pair(0U, &PPC::LRRCRegClass);
  }

  // Handle special cases of physical registers that are not properly handled
  // by the base class. Guard against an empty constraint before looking at its
  // first and last characters.
  if (!Constraint.empty() && Constraint.front() == '{' &&
      Constraint.back() == '}') {
    // If we name a VSX register, we can't defer to the base class because it
    // will not recognize the correct register (their names will be VSL{0-31}
    // and V{0-31} so they won't match). So we match them here.
    //
    // Require a digit after "vs": atoi() yields 0 for non-numeric text, which
    // would otherwise silently turn any other "{vs...}" name into vs0.
    if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's' &&
        Constraint[3] >= '0' && Constraint[3] <= '9') {
      int VSNum = atoi(Constraint.data() + 3);
      // The number comes from user-written inline asm, so reject it in all
      // build modes rather than only asserting.
      if (VSNum < 0 || VSNum > 63)
        report_fatal_error("Attempted to access a vsr out of range");
      if (VSNum < 32)
        return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
      return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
    }

    // For float registers, we can't defer to the base class as it will match
    // the SPILLTOVSRRC class.
    if (Constraint.size() > 3 && Constraint[1] == 'f') {
      int RegNum = atoi(Constraint.data() + 2);
      if (RegNum > 31 || RegNum < 0)
        report_fatal_error("Invalid floating point register number");
      if (VT == MVT::f32 || VT == MVT::i32)
        return Subtarget.hasSPE()
                   ? std::make_pair(PPC::R0 + RegNum, &PPC::GPRCRegClass)
                   : std::make_pair(PPC::F0 + RegNum, &PPC::F4RCRegClass);
      if (VT == MVT::f64 || VT == MVT::i64)
        return Subtarget.hasSPE()
                   ? std::make_pair(PPC::S0 + RegNum, &PPC::SPERCRegClass)
                   : std::make_pair(PPC::F0 + RegNum, &PPC::F8RCRegClass);
    }
  }

  std::pair<unsigned, const TargetRegisterClass *> R =
      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
  // register.
  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
      PPC::GPRCRegClass.contains(R.first))
    return std::make_pair(TRI->getMatchingSuperReg(R.first,
                            PPC::sub_32, &PPC::G8RCRegClass),
                          &PPC::G8RCRegClass);

  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
  if (!R.second && StringRef("{cc}").equals_insensitive(Constraint)) {
    R.first = PPC::CR0;
    R.second = &PPC::CRRCRegClass;
  }
  // FIXME: This warning should ideally be emitted in the front end.
  const auto &TM = getTargetMachine();
  if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {
    if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||
         (R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&
        (R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))
      errs() << "warning: vector registers 20 to 32 are reserved in the "
                "default AIX AltiVec ABI and cannot be used\n";
  }

  return R;
}
16932
16933
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
16934
/// vector. If it is invalid, don't add anything to Ops.
16935
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16936
StringRef Constraint,
16937
std::vector<SDValue> &Ops,
16938
SelectionDAG &DAG) const {
16939
SDValue Result;
16940
16941
// Only support length 1 constraints.
16942
if (Constraint.size() > 1)
16943
return;
16944
16945
char Letter = Constraint[0];
16946
switch (Letter) {
16947
default: break;
16948
case 'I':
16949
case 'J':
16950
case 'K':
16951
case 'L':
16952
case 'M':
16953
case 'N':
16954
case 'O':
16955
case 'P': {
16956
ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
16957
if (!CST) return; // Must be an immediate to match.
16958
SDLoc dl(Op);
16959
int64_t Value = CST->getSExtValue();
16960
EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
16961
// numbers are printed as such.
16962
switch (Letter) {
16963
default: llvm_unreachable("Unknown constraint letter!");
16964
case 'I': // "I" is a signed 16-bit constant.
16965
if (isInt<16>(Value))
16966
Result = DAG.getTargetConstant(Value, dl, TCVT);
16967
break;
16968
case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
16969
if (isShiftedUInt<16, 16>(Value))
16970
Result = DAG.getTargetConstant(Value, dl, TCVT);
16971
break;
16972
case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
16973
if (isShiftedInt<16, 16>(Value))
16974
Result = DAG.getTargetConstant(Value, dl, TCVT);
16975
break;
16976
case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
16977
if (isUInt<16>(Value))
16978
Result = DAG.getTargetConstant(Value, dl, TCVT);
16979
break;
16980
case 'M': // "M" is a constant that is greater than 31.
16981
if (Value > 31)
16982
Result = DAG.getTargetConstant(Value, dl, TCVT);
16983
break;
16984
case 'N': // "N" is a positive constant that is an exact power of two.
16985
if (Value > 0 && isPowerOf2_64(Value))
16986
Result = DAG.getTargetConstant(Value, dl, TCVT);
16987
break;
16988
case 'O': // "O" is the constant zero.
16989
if (Value == 0)
16990
Result = DAG.getTargetConstant(Value, dl, TCVT);
16991
break;
16992
case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
16993
if (isInt<16>(-Value))
16994
Result = DAG.getTargetConstant(Value, dl, TCVT);
16995
break;
16996
}
16997
break;
16998
}
16999
}
17000
17001
if (Result.getNode()) {
17002
Ops.push_back(Result);
17003
return;
17004
}
17005
17006
// Handle standard constraint letters.
17007
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17008
}
17009
17010
void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,
17011
SmallVectorImpl<SDValue> &Ops,
17012
SelectionDAG &DAG) const {
17013
if (I.getNumOperands() <= 1)
17014
return;
17015
if (!isa<ConstantSDNode>(Ops[1].getNode()))
17016
return;
17017
auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();
17018
if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&
17019
IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)
17020
return;
17021
17022
if (MDNode *MDN = I.getMetadata(LLVMContext::MD_annotation))
17023
Ops.push_back(DAG.getMDNode(MDN));
17024
}
17025
17026
// isLegalAddressingMode - Return true if the addressing mode represented
17027
// by AM is legal for this target, for a load/store of the specified type.
17028
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
17029
const AddrMode &AM, Type *Ty,
17030
unsigned AS,
17031
Instruction *I) const {
17032
// Vector type r+i form is supported since power9 as DQ form. We don't check
17033
// the offset matching DQ form requirement(off % 16 == 0), because on PowerPC,
17034
// imm form is preferred and the offset can be adjusted to use imm form later
17035
// in pass PPCLoopInstrFormPrep. Also in LSR, for one LSRUse, it uses min and
17036
// max offset to check legal addressing mode, we should be a little aggressive
17037
// to contain other offsets for that LSRUse.
17038
if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())
17039
return false;
17040
17041
// PPC allows a sign-extended 16-bit immediate field.
17042
if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
17043
return false;
17044
17045
// No global is ever allowed as a base.
17046
if (AM.BaseGV)
17047
return false;
17048
17049
// PPC only support r+r,
17050
switch (AM.Scale) {
17051
case 0: // "r+i" or just "i", depending on HasBaseReg.
17052
break;
17053
case 1:
17054
if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
17055
return false;
17056
// Otherwise we have r+r or r+i.
17057
break;
17058
case 2:
17059
if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
17060
return false;
17061
// Allow 2*r as r+r.
17062
break;
17063
default:
17064
// No other scales are supported.
17065
return false;
17066
}
17067
17068
return true;
17069
}
17070
17071
/// Lower a RETURNADDR node.
///
/// Marks the return address as taken and the LR store as required. Depth 0
/// loads from the return-address frame index; a non-zero depth loads the frame
/// address produced by LowerFRAMEADDR and then reads the return address at the
/// return-save offset from it. Returns an empty SDValue when
/// verifyReturnAddressArgumentIsConstant returns true.
SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  const unsigned Depth = Op.getConstantOperandVal(0);

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  MF.getInfo<PPCFunctionInfo>()->setLRStoreRequired();

  const bool Is64Bit = Subtarget.isPPC64();
  auto PtrVT = getPointerTy(MF.getDataLayout());

  if (Depth == 0) {
    // Just load the return address off the stack.
    SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                       MachinePointerInfo());
  }

  // The link register (return address) is saved in the caller's frame
  // not the callee's stack frame. So we must get the caller's frame
  // address and load the return address at the LR offset from there.
  SDValue CalleeFrame = LowerFRAMEADDR(Op, DAG);
  SDValue CallerFrame = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                                    CalleeFrame, MachinePointerInfo());
  SDValue SaveOffset =
      DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
                      Is64Bit ? MVT::i64 : MVT::i32);
  SDValue SaveSlot =
      DAG.getNode(ISD::ADD, dl, PtrVT, CallerFrame, SaveOffset);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), SaveSlot,
                     MachinePointerInfo());
}
17110
17111
/// Lower a FRAMEADDR node.
///
/// Marks the frame address as taken, copies the frame register and then
/// follows the chain of saved frame addresses once per requested depth level.
SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const unsigned Depth = Op.getConstantOperandVal(0);

  MachineFunction &MF = DAG.getMachineFunction();
  MF.getFrameInfo().setFrameAddressIsTaken(true);

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  const bool Is64Bit = PtrVT == MVT::i64;

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  const bool IsNaked = MF.getFunction().hasFnAttribute(Attribute::Naked);
  const unsigned FrameReg = IsNaked ? (Is64Bit ? PPC::X1 : PPC::R1)
                                    : (Is64Bit ? PPC::FP8 : PPC::FP);

  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);

  // Each level loads the next frame address from the current one.
  for (unsigned Level = 0; Level != Depth; ++Level)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}
17138
17139
// FIXME? Maybe this could be a TableGen attribute on some registers and
17140
// this table could be generated automatically from RegInfo.
17141
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
17142
const MachineFunction &MF) const {
17143
bool isPPC64 = Subtarget.isPPC64();
17144
17145
bool is64Bit = isPPC64 && VT == LLT::scalar(64);
17146
if (!is64Bit && VT != LLT::scalar(32))
17147
report_fatal_error("Invalid register global variable type");
17148
17149
Register Reg = StringSwitch<Register>(RegName)
17150
.Case("r1", is64Bit ? PPC::X1 : PPC::R1)
17151
.Case("r2", isPPC64 ? Register() : PPC::R2)
17152
.Case("r13", (is64Bit ? PPC::X13 : PPC::R13))
17153
.Default(Register());
17154
17155
if (Reg)
17156
return Reg;
17157
report_fatal_error("Invalid register name global variable");
17158
}
17159
17160
/// Return true when the address node \p GA is accessed got-indirectly
/// (through the GOT, or through the TOC on AIX).
bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
  // 32-bit SVR4 ABI access everything as got-indirect. AIX accesses
  // everything indirectly through the TOC, which is similar to the GOT.
  if (Subtarget.is32BitELFABI() || Subtarget.isAIXABI())
    return true;

  // If it is small or large code model, module locals are accessed
  // indirectly by loading their address from .toc/.got.
  const CodeModel::Model CM = getTargetMachine().getCodeModel();
  if (CM == CodeModel::Small || CM == CodeModel::Large)
    return true;

  // JumpTable and BlockAddress are accessed as got-indirect.
  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
    return true;

  // For a global address the subtarget decides; anything else is direct.
  GlobalAddressSDNode *Global = dyn_cast<GlobalAddressSDNode>(GA);
  return Global && Subtarget.isGVIndirectSymbol(Global->getGlobal());
}
17185
17186
bool
17187
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
17188
// The PowerPC target isn't yet aware of offsets.
17189
return false;
17190
}
17191
17192
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
17193
const CallInst &I,
17194
MachineFunction &MF,
17195
unsigned Intrinsic) const {
17196
switch (Intrinsic) {
17197
case Intrinsic::ppc_atomicrmw_xchg_i128:
17198
case Intrinsic::ppc_atomicrmw_add_i128:
17199
case Intrinsic::ppc_atomicrmw_sub_i128:
17200
case Intrinsic::ppc_atomicrmw_nand_i128:
17201
case Intrinsic::ppc_atomicrmw_and_i128:
17202
case Intrinsic::ppc_atomicrmw_or_i128:
17203
case Intrinsic::ppc_atomicrmw_xor_i128:
17204
case Intrinsic::ppc_cmpxchg_i128:
17205
Info.opc = ISD::INTRINSIC_W_CHAIN;
17206
Info.memVT = MVT::i128;
17207
Info.ptrVal = I.getArgOperand(0);
17208
Info.offset = 0;
17209
Info.align = Align(16);
17210
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
17211
MachineMemOperand::MOVolatile;
17212
return true;
17213
case Intrinsic::ppc_atomic_load_i128:
17214
Info.opc = ISD::INTRINSIC_W_CHAIN;
17215
Info.memVT = MVT::i128;
17216
Info.ptrVal = I.getArgOperand(0);
17217
Info.offset = 0;
17218
Info.align = Align(16);
17219
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17220
return true;
17221
case Intrinsic::ppc_atomic_store_i128:
17222
Info.opc = ISD::INTRINSIC_VOID;
17223
Info.memVT = MVT::i128;
17224
Info.ptrVal = I.getArgOperand(2);
17225
Info.offset = 0;
17226
Info.align = Align(16);
17227
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17228
return true;
17229
case Intrinsic::ppc_altivec_lvx:
17230
case Intrinsic::ppc_altivec_lvxl:
17231
case Intrinsic::ppc_altivec_lvebx:
17232
case Intrinsic::ppc_altivec_lvehx:
17233
case Intrinsic::ppc_altivec_lvewx:
17234
case Intrinsic::ppc_vsx_lxvd2x:
17235
case Intrinsic::ppc_vsx_lxvw4x:
17236
case Intrinsic::ppc_vsx_lxvd2x_be:
17237
case Intrinsic::ppc_vsx_lxvw4x_be:
17238
case Intrinsic::ppc_vsx_lxvl:
17239
case Intrinsic::ppc_vsx_lxvll: {
17240
EVT VT;
17241
switch (Intrinsic) {
17242
case Intrinsic::ppc_altivec_lvebx:
17243
VT = MVT::i8;
17244
break;
17245
case Intrinsic::ppc_altivec_lvehx:
17246
VT = MVT::i16;
17247
break;
17248
case Intrinsic::ppc_altivec_lvewx:
17249
VT = MVT::i32;
17250
break;
17251
case Intrinsic::ppc_vsx_lxvd2x:
17252
case Intrinsic::ppc_vsx_lxvd2x_be:
17253
VT = MVT::v2f64;
17254
break;
17255
default:
17256
VT = MVT::v4i32;
17257
break;
17258
}
17259
17260
Info.opc = ISD::INTRINSIC_W_CHAIN;
17261
Info.memVT = VT;
17262
Info.ptrVal = I.getArgOperand(0);
17263
Info.offset = -VT.getStoreSize()+1;
17264
Info.size = 2*VT.getStoreSize()-1;
17265
Info.align = Align(1);
17266
Info.flags = MachineMemOperand::MOLoad;
17267
return true;
17268
}
17269
case Intrinsic::ppc_altivec_stvx:
17270
case Intrinsic::ppc_altivec_stvxl:
17271
case Intrinsic::ppc_altivec_stvebx:
17272
case Intrinsic::ppc_altivec_stvehx:
17273
case Intrinsic::ppc_altivec_stvewx:
17274
case Intrinsic::ppc_vsx_stxvd2x:
17275
case Intrinsic::ppc_vsx_stxvw4x:
17276
case Intrinsic::ppc_vsx_stxvd2x_be:
17277
case Intrinsic::ppc_vsx_stxvw4x_be:
17278
case Intrinsic::ppc_vsx_stxvl:
17279
case Intrinsic::ppc_vsx_stxvll: {
17280
EVT VT;
17281
switch (Intrinsic) {
17282
case Intrinsic::ppc_altivec_stvebx:
17283
VT = MVT::i8;
17284
break;
17285
case Intrinsic::ppc_altivec_stvehx:
17286
VT = MVT::i16;
17287
break;
17288
case Intrinsic::ppc_altivec_stvewx:
17289
VT = MVT::i32;
17290
break;
17291
case Intrinsic::ppc_vsx_stxvd2x:
17292
case Intrinsic::ppc_vsx_stxvd2x_be:
17293
VT = MVT::v2f64;
17294
break;
17295
default:
17296
VT = MVT::v4i32;
17297
break;
17298
}
17299
17300
Info.opc = ISD::INTRINSIC_VOID;
17301
Info.memVT = VT;
17302
Info.ptrVal = I.getArgOperand(1);
17303
Info.offset = -VT.getStoreSize()+1;
17304
Info.size = 2*VT.getStoreSize()-1;
17305
Info.align = Align(1);
17306
Info.flags = MachineMemOperand::MOStore;
17307
return true;
17308
}
17309
case Intrinsic::ppc_stdcx:
17310
case Intrinsic::ppc_stwcx:
17311
case Intrinsic::ppc_sthcx:
17312
case Intrinsic::ppc_stbcx: {
17313
EVT VT;
17314
auto Alignment = Align(8);
17315
switch (Intrinsic) {
17316
case Intrinsic::ppc_stdcx:
17317
VT = MVT::i64;
17318
break;
17319
case Intrinsic::ppc_stwcx:
17320
VT = MVT::i32;
17321
Alignment = Align(4);
17322
break;
17323
case Intrinsic::ppc_sthcx:
17324
VT = MVT::i16;
17325
Alignment = Align(2);
17326
break;
17327
case Intrinsic::ppc_stbcx:
17328
VT = MVT::i8;
17329
Alignment = Align(1);
17330
break;
17331
}
17332
Info.opc = ISD::INTRINSIC_W_CHAIN;
17333
Info.memVT = VT;
17334
Info.ptrVal = I.getArgOperand(0);
17335
Info.offset = 0;
17336
Info.align = Alignment;
17337
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17338
return true;
17339
}
17340
default:
17341
break;
17342
}
17343
17344
return false;
17345
}
17346
17347
/// It returns EVT::Other if the type should be determined using generic
17348
/// target-independent logic.
17349
EVT PPCTargetLowering::getOptimalMemOpType(
17350
const MemOp &Op, const AttributeList &FuncAttributes) const {
17351
if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) {
17352
// We should use Altivec/VSX loads and stores when available. For unaligned
17353
// addresses, unaligned VSX loads are only fast starting with the P8.
17354
if (Subtarget.hasAltivec() && Op.size() >= 16) {
17355
if (Op.isMemset() && Subtarget.hasVSX()) {
17356
uint64_t TailSize = Op.size() % 16;
17357
// For memset lowering, EXTRACT_VECTOR_ELT tries to return constant
17358
// element if vector element type matches tail store. For tail size
17359
// 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.
17360
if (TailSize > 2 && TailSize <= 4) {
17361
return MVT::v8i16;
17362
}
17363
return MVT::v4i32;
17364
}
17365
if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())
17366
return MVT::v4i32;
17367
}
17368
}
17369
17370
if (Subtarget.isPPC64()) {
17371
return MVT::i64;
17372
}
17373
17374
return MVT::i32;
17375
}
17376
17377
/// Returns true if it is beneficial to convert a load of a constant
17378
/// to just the constant itself.
17379
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17380
Type *Ty) const {
17381
assert(Ty->isIntegerTy());
17382
17383
unsigned BitSize = Ty->getPrimitiveSizeInBits();
17384
return !(BitSize == 0 || BitSize > 64);
17385
}
17386
17387
/// IR-type overload: a truncate is reported as free only when it goes from a
/// 64-bit integer type (\p Ty1) to a 32-bit integer type (\p Ty2).
bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
  if (!BothIntegers)
    return false;

  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
  const unsigned DstBits = Ty2->getPrimitiveSizeInBits();
  return SrcBits == 64 && DstBits == 32;
}
17394
17395
/// EVT overload: a truncate is reported as free only when it goes from a
/// 64-bit integer value type (\p VT1) to a 32-bit integer value type (\p VT2).
bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  const bool BothIntegers = VT1.isInteger() && VT2.isInteger();
  if (!BothIntegers)
    return false;

  const unsigned SrcBits = VT1.getSizeInBits();
  const unsigned DstBits = VT2.getSizeInBits();
  return SrcBits == 64 && DstBits == 32;
}
17402
17403
/// Report whether zero-extending \p Val to \p VT2 is free. Generally speaking,
/// zexts are not free, but they are free when they can be folded with other
/// operations; the only case recognized here is a narrow load.
bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  if (auto *Load = dyn_cast<LoadSDNode>(Val)) {
    const EVT LoadedVT = Load->getMemoryVT();

    // i1/i8/i16 always qualify; i32 only qualifies on PPC64.
    const bool IsFoldableWidth =
        LoadedVT == MVT::i1 || LoadedVT == MVT::i8 || LoadedVT == MVT::i16 ||
        (Subtarget.isPPC64() && LoadedVT == MVT::i32);

    // The load must not already be a sign- or any-extending load.
    const bool IsFoldableExt = Load->getExtensionType() == ISD::NON_EXTLOAD ||
                               Load->getExtensionType() == ISD::ZEXTLOAD;

    if (IsFoldableWidth && IsFoldableExt)
      return true;
  }

  // FIXME: Add other cases...
  //  - 32-bit shifts with a zext to i64
  //  - zext after ctlz, bswap, etc.
  //  - zext after and by a constant mask

  return TargetLowering::isZExtFree(Val, VT2);
}
17422
17423
/// Report whether a floating-point extension from \p SrcVT to \p DestVT is
/// free. Both types must be floating point (asserted).
bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
         "invalid fpext types");
  // Extending to float128 is not free; every other extension is.
  return DestVT != MVT::f128;
}
17431
17432
/// A compare immediate is legal when it fits in 16 bits, interpreted either
/// as a signed or as an unsigned value.
bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  const bool FitsSigned16 = isInt<16>(Imm);
  const bool FitsUnsigned16 = isUInt<16>(Imm);
  return FitsSigned16 || FitsUnsigned16;
}
17435
17436
/// An add immediate is legal when it fits in 16 bits, interpreted either as a
/// signed or as an unsigned value.
bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  const bool FitsSigned16 = isInt<16>(Imm);
  const bool FitsUnsigned16 = isUInt<16>(Imm);
  return FitsSigned16 || FitsUnsigned16;
}
17439
17440
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
17441
MachineMemOperand::Flags,
17442
unsigned *Fast) const {
17443
if (DisablePPCUnaligned)
17444
return false;
17445
17446
// PowerPC supports unaligned memory access for simple non-vector types.
17447
// Although accessing unaligned addresses is not as efficient as accessing
17448
// aligned addresses, it is generally more efficient than manual expansion,
17449
// and generally only traps for software emulation when crossing page
17450
// boundaries.
17451
17452
if (!VT.isSimple())
17453
return false;
17454
17455
if (VT.isFloatingPoint() && !VT.isVector() &&
17456
!Subtarget.allowsUnalignedFPAccess())
17457
return false;
17458
17459
if (VT.getSimpleVT().isVector()) {
17460
if (Subtarget.hasVSX()) {
17461
if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
17462
VT != MVT::v4f32 && VT != MVT::v4i32)
17463
return false;
17464
} else {
17465
return false;
17466
}
17467
}
17468
17469
if (VT == MVT::ppcf128)
17470
return false;
17471
17472
if (Fast)
17473
*Fast = 1;
17474
17475
return true;
17476
}
17477
17478
bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
17479
SDValue C) const {
17480
// Check integral scalar types.
17481
if (!VT.isScalarInteger())
17482
return false;
17483
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
17484
if (!ConstNode->getAPIntValue().isSignedIntN(64))
17485
return false;
17486
// This transformation will generate >= 2 operations. But the following
17487
// cases will generate <= 2 instructions during ISEL. So exclude them.
17488
// 1. If the constant multiplier fits 16 bits, it can be handled by one
17489
// HW instruction, ie. MULLI
17490
// 2. If the multiplier after shifted fits 16 bits, an extra shift
17491
// instruction is needed than case 1, ie. MULLI and RLDICR
17492
int64_t Imm = ConstNode->getSExtValue();
17493
unsigned Shift = llvm::countr_zero<uint64_t>(Imm);
17494
Imm >>= Shift;
17495
if (isInt<16>(Imm))
17496
return false;
17497
uint64_t UImm = static_cast<uint64_t>(Imm);
17498
if (isPowerOf2_64(UImm + 1) || isPowerOf2_64(UImm - 1) ||
17499
isPowerOf2_64(1 - UImm) || isPowerOf2_64(-1 - UImm))
17500
return true;
17501
}
17502
return false;
17503
}
17504
17505
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
17506
EVT VT) const {
17507
return isFMAFasterThanFMulAndFAdd(
17508
MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
17509
}
17510
17511
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17512
Type *Ty) const {
17513
if (Subtarget.hasSPE() || Subtarget.useSoftFloat())
17514
return false;
17515
switch (Ty->getScalarType()->getTypeID()) {
17516
case Type::FloatTyID:
17517
case Type::DoubleTyID:
17518
return true;
17519
case Type::FP128TyID:
17520
return Subtarget.hasP9Vector();
17521
default:
17522
return false;
17523
}
17524
}
17525
17526
// FIXME: add more patterns which are not profitable to hoist.
17527
bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
17528
if (!I->hasOneUse())
17529
return true;
17530
17531
Instruction *User = I->user_back();
17532
assert(User && "A single use instruction with no uses.");
17533
17534
switch (I->getOpcode()) {
17535
case Instruction::FMul: {
17536
// Don't break FMA, PowerPC prefers FMA.
17537
if (User->getOpcode() != Instruction::FSub &&
17538
User->getOpcode() != Instruction::FAdd)
17539
return true;
17540
17541
const TargetOptions &Options = getTargetMachine().Options;
17542
const Function *F = I->getFunction();
17543
const DataLayout &DL = F->getDataLayout();
17544
Type *Ty = User->getOperand(0)->getType();
17545
17546
return !(
17547
isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17548
isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
17549
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
17550
}
17551
case Instruction::Load: {
17552
// Don't break "store (load float*)" pattern, this pattern will be combined
17553
// to "store (load int32)" in later InstCombine pass. See function
17554
// combineLoadToOperationType. On PowerPC, loading a float point takes more
17555
// cycles than loading a 32 bit integer.
17556
LoadInst *LI = cast<LoadInst>(I);
17557
// For the loads that combineLoadToOperationType does nothing, like
17558
// ordered load, it should be profitable to hoist them.
17559
// For swifterror load, it can only be used for pointer to pointer type, so
17560
// later type check should get rid of this case.
17561
if (!LI->isUnordered())
17562
return true;
17563
17564
if (User->getOpcode() != Instruction::Store)
17565
return true;
17566
17567
if (I->getType()->getTypeID() != Type::FloatTyID)
17568
return true;
17569
17570
return false;
17571
}
17572
default:
17573
return true;
17574
}
17575
return true;
17576
}
17577
17578
/// Return the zero-terminated list of scratch registers; the calling
/// convention argument is ignored.
const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any
  // call site. Hence we include LR in the scratch registers, which are in turn
  // added as implicit-defs for stackmaps and patchpoints. The same reasoning
  // applies to CTR, which is used by any indirect call.
  static const MCPhysReg CallClobbered[] = {PPC::X12, PPC::LR8, PPC::CTR8, 0};

  return CallClobbered;
}
17590
17591
/// Register holding the exception pointer: X3 on PPC64, R3 otherwise. The
/// personality function is not consulted.
Register PPCTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (Subtarget.isPPC64())
    return PPC::X3;
  return PPC::R3;
}
17595
17596
/// Register holding the exception selector: X4 on PPC64, R4 otherwise. The
/// personality function is not consulted.
Register PPCTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  if (Subtarget.isPPC64())
    return PPC::X4;
  return PPC::R4;
}
17600
17601
bool
17602
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
17603
EVT VT , unsigned DefinedValues) const {
17604
if (VT == MVT::v2i64)
17605
return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves
17606
17607
if (Subtarget.hasVSX())
17608
return true;
17609
17610
return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
17611
}
17612
17613
/// Scheduling preference for \p N: ILP by default, or the generic preference
/// when the ILP preference is disabled or the machine scheduler is enabled.
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
  const bool UseGenericPreference =
      DisableILPPref || Subtarget.enableMachineScheduler();
  if (!UseGenericPreference)
    return Sched::ILP;

  return TargetLowering::getSchedulingPreference(N);
}
17619
17620
// Create a fast isel object.
17621
FastISel *
17622
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
17623
const TargetLibraryInfo *LibInfo) const {
17624
return PPC::createFastISel(FuncInfo, LibInfo);
17625
}
17626
17627
// 'Inverted' means the FMA opcode after negating one multiplicand.
17628
// For example, (fma -a b c) = (fnmsub a b c)
17629
static unsigned invertFMAOpcode(unsigned Opc) {
17630
switch (Opc) {
17631
default:
17632
llvm_unreachable("Invalid FMA opcode for PowerPC!");
17633
case ISD::FMA:
17634
return PPCISD::FNMSUB;
17635
case PPCISD::FNMSUB:
17636
return ISD::FMA;
17637
}
17638
}
17639
17640
/// Build the negation of \p Op, adding PowerPC-specific handling for a
/// single-use PPCISD::FNMSUB of a legal type; everything else is forwarded to
/// the generic implementation. Returns an empty SDValue when the recursion
/// limit is exceeded or when the addend of an FNMSUB cannot be negated. On
/// success \p Cost receives the cost of the chosen form.
SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                                bool LegalOps, bool OptForSize,
                                                NegatibleCost &Cost,
                                                unsigned Depth) const {
  if (Depth > SelectionDAG::MaxRecursionDepth)
    return SDValue();

  const unsigned Opc = Op.getOpcode();
  const EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op.getNode()->getFlags();

  const bool IsHandledFNMSUB =
      Opc == PPCISD::FNMSUB && Op.hasOneUse() && isTypeLegal(VT);
  if (!IsHandledFNMSUB)
    return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
                                                Cost, Depth);

  const TargetOptions &Options = getTargetMachine().Options;
  SDValue MulA = Op.getOperand(0);
  SDValue MulB = Op.getOperand(1);
  SDValue Addend = Op.getOperand(2);
  SDLoc Loc(Op);

  // Every rewrite below needs the negated addend, so give up without it.
  NegatibleCost AddendCost = NegatibleCost::Expensive;
  SDValue NegAddend = getNegatedExpression(Addend, DAG, LegalOps, OptForSize,
                                           AddendCost, Depth + 1);
  if (!NegAddend)
    return SDValue();

  // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
  // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
  // These transformations may change sign of zeroes. For example,
  // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
  if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
    // Try and choose the cheaper multiplicand to negate.
    NegatibleCost MulACost = NegatibleCost::Expensive;
    SDValue NegMulA = getNegatedExpression(MulA, DAG, LegalOps, OptForSize,
                                           MulACost, Depth + 1);

    NegatibleCost MulBCost = NegatibleCost::Expensive;
    SDValue NegMulB = getNegatedExpression(MulB, DAG, LegalOps, OptForSize,
                                           MulBCost, Depth + 1);

    if (NegMulA && MulACost <= MulBCost) {
      Cost = std::min(MulACost, AddendCost);
      return DAG.getNode(Opc, Loc, VT, NegMulA, MulB, NegAddend, Flags);
    }
    if (NegMulB) {
      Cost = std::min(MulBCost, AddendCost);
      return DAG.getNode(Opc, Loc, VT, MulA, NegMulB, NegAddend, Flags);
    }
  }

  // (fneg (fnmsub a b c)) => (fma a b (fneg c))
  if (isOperationLegal(ISD::FMA, VT)) {
    Cost = AddendCost;
    return DAG.getNode(ISD::FMA, Loc, VT, MulA, MulB, NegAddend, Flags);
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
                                              Cost, Depth);
}
17704
17705
// Override to enable LOAD_STACK_GUARD lowering on Linux.
17706
bool PPCTargetLowering::useLoadStackGuardNode() const {
17707
if (!Subtarget.isTargetLinux())
17708
return TargetLowering::useLoadStackGuardNode();
17709
return true;
17710
}
17711
17712
// Override to disable global variable loading on Linux and insert AIX canary
17713
// word declaration.
17714
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
17715
if (Subtarget.isAIXABI()) {
17716
M.getOrInsertGlobal(AIXSSPCanaryWordName,
17717
PointerType::getUnqual(M.getContext()));
17718
return;
17719
}
17720
if (!Subtarget.isTargetLinux())
17721
return TargetLowering::insertSSPDeclarations(M);
17722
}
17723
17724
/// Return the stack guard value for \p M: the AIX canary word global under
/// the AIX ABI, otherwise whatever the generic implementation provides.
Value *PPCTargetLowering::getSDagStackGuard(const Module &M) const {
  if (!Subtarget.isAIXABI())
    return TargetLowering::getSDagStackGuard(M);

  return M.getGlobalVariable(AIXSSPCanaryWordName);
}
17729
17730
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
17731
bool ForCodeSize) const {
17732
if (!VT.isSimple() || !Subtarget.hasVSX())
17733
return false;
17734
17735
switch(VT.getSimpleVT().SimpleTy) {
17736
default:
17737
// For FP types that are currently not supported by PPC backend, return
17738
// false. Examples: f16, f80.
17739
return false;
17740
case MVT::f32:
17741
case MVT::f64: {
17742
if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
17743
// we can materialize all immediatess via XXSPLTI32DX and XXSPLTIDP.
17744
return true;
17745
}
17746
bool IsExact;
17747
APSInt IntResult(16, false);
17748
// The rounding mode doesn't really matter because we only care about floats
17749
// that can be converted to integers exactly.
17750
Imm.convertToInteger(IntResult, APFloat::rmTowardZero, &IsExact);
17751
// For exact values in the range [-16, 15] we can materialize the float.
17752
if (IsExact && IntResult <= 15 && IntResult >= -16)
17753
return true;
17754
return Imm.isZero();
17755
}
17756
case MVT::ppcf128:
17757
return Imm.isPosZero();
17758
}
17759
}
17760
17761
// For vector shift operation op, fold
17762
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
17763
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
17764
SelectionDAG &DAG) {
17765
SDValue N0 = N->getOperand(0);
17766
SDValue N1 = N->getOperand(1);
17767
EVT VT = N0.getValueType();
17768
unsigned OpSizeInBits = VT.getScalarSizeInBits();
17769
unsigned Opcode = N->getOpcode();
17770
unsigned TargetOpcode;
17771
17772
switch (Opcode) {
17773
default:
17774
llvm_unreachable("Unexpected shift operation");
17775
case ISD::SHL:
17776
TargetOpcode = PPCISD::SHL;
17777
break;
17778
case ISD::SRL:
17779
TargetOpcode = PPCISD::SRL;
17780
break;
17781
case ISD::SRA:
17782
TargetOpcode = PPCISD::SRA;
17783
break;
17784
}
17785
17786
if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
17787
N1->getOpcode() == ISD::AND)
17788
if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
17789
if (Mask->getZExtValue() == OpSizeInBits - 1)
17790
return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
17791
17792
return SDValue();
17793
}
17794
17795
/// DAG combine for ISD::SHL. First tries to strip a redundant modulo mask
/// from a vector shift amount; otherwise, on 64-bit ISA 3.0 subtargets, turns
/// (shl (sext i32 x to i64), constant) into a PPCISD::EXTSWSLI node.
SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  if (!Subtarget.isISA3_0() || !Subtarget.isPPC64())
    return SDValue();

  // The shifted value must be a sign extension from i32.
  SDValue Extended = N->getOperand(0);
  if (Extended.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();
  SDValue ExtsSrc = Extended.getOperand(0);
  if (ExtsSrc.getValueType() != MVT::i32)
    return SDValue();

  // The shift amount must be a constant and the result an i64.
  auto *ShiftAmt = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!ShiftAmt || N->getValueType(0) != MVT::i64)
    return SDValue();

  // We can't save an operation here if the value is already extended, and
  // the existing shift is easier to combine.
  if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
      ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
    return SDValue();

  SDLoc DL(Extended);
  SDValue ShiftBy = SDValue(ShiftAmt, 0);
  // We want the shift amount to be i32 on the extswli, but the shift could
  // have an i64.
  if (ShiftBy.getValueType() == MVT::i64)
    ShiftBy = DCI.DAG.getConstant(ShiftAmt->getZExtValue(), DL, MVT::i32);

  return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, ExtsSrc, ShiftBy);
}
17824
17825
/// DAG combine for ISD::SRA: the only transformation attempted is stripping a
/// redundant modulo mask from a vector shift amount. An empty SDValue is
/// returned when that does not apply.
SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
  return stripModuloOnShift(*this, N, DCI.DAG);
}
17831
17832
/// DAG combine for ISD::SRL: the only transformation attempted is stripping a
/// redundant modulo mask from a vector shift amount. An empty SDValue is
/// returned when that does not apply.
SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
  return stripModuloOnShift(*this, N, DCI.DAG);
}
17838
17839
// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
17840
// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
17841
// When C is zero, the equation (addi Z, -C) can be simplified to Z
17842
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
17843
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
17844
const PPCSubtarget &Subtarget) {
17845
if (!Subtarget.isPPC64())
17846
return SDValue();
17847
17848
SDValue LHS = N->getOperand(0);
17849
SDValue RHS = N->getOperand(1);
17850
17851
auto isZextOfCompareWithConstant = [](SDValue Op) {
17852
if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
17853
Op.getValueType() != MVT::i64)
17854
return false;
17855
17856
SDValue Cmp = Op.getOperand(0);
17857
if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
17858
Cmp.getOperand(0).getValueType() != MVT::i64)
17859
return false;
17860
17861
if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {
17862
int64_t NegConstant = 0 - Constant->getSExtValue();
17863
// Due to the limitations of the addi instruction,
17864
// -C is required to be [-32768, 32767].
17865
return isInt<16>(NegConstant);
17866
}
17867
17868
return false;
17869
};
17870
17871
bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
17872
bool RHSHasPattern = isZextOfCompareWithConstant(RHS);
17873
17874
// If there is a pattern, canonicalize a zext operand to the RHS.
17875
if (LHSHasPattern && !RHSHasPattern)
17876
std::swap(LHS, RHS);
17877
else if (!LHSHasPattern && !RHSHasPattern)
17878
return SDValue();
17879
17880
SDLoc DL(N);
17881
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
17882
SDValue Cmp = RHS.getOperand(0);
17883
SDValue Z = Cmp.getOperand(0);
17884
auto *Constant = cast<ConstantSDNode>(Cmp.getOperand(1));
17885
int64_t NegConstant = 0 - Constant->getSExtValue();
17886
17887
switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
17888
default: break;
17889
case ISD::SETNE: {
17890
// when C == 0
17891
// --> addze X, (addic Z, -1).carry
17892
// /
17893
// add X, (zext(setne Z, C))--
17894
// \ when -32768 <= -C <= 32767 && C != 0
17895
// --> addze X, (addic (addi Z, -C), -1).carry
17896
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
17897
DAG.getConstant(NegConstant, DL, MVT::i64));
17898
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
17899
SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
17900
AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
17901
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
17902
SDValue(Addc.getNode(), 1));
17903
}
17904
case ISD::SETEQ: {
17905
// when C == 0
17906
// --> addze X, (subfic Z, 0).carry
17907
// /
17908
// add X, (zext(sete Z, C))--
17909
// \ when -32768 <= -C <= 32767 && C != 0
17910
// --> addze X, (subfic (addi Z, -C), 0).carry
17911
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
17912
DAG.getConstant(NegConstant, DL, MVT::i64));
17913
SDValue AddOrZ = NegConstant != 0 ? Add : Z;
17914
SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
17915
DAG.getConstant(0, DL, MVT::i64), AddOrZ);
17916
return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
17917
SDValue(Subc.getNode(), 1));
17918
}
17919
}
17920
17921
return SDValue();
17922
}
17923
17924
// Transform
17925
// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
17926
// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
17927
// In this case both C1 and C2 must be known constants.
17928
// C1+C2 must fit into a 34 bit signed integer.
17929
static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
17930
const PPCSubtarget &Subtarget) {
17931
if (!Subtarget.isUsingPCRelativeCalls())
17932
return SDValue();
17933
17934
// Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
17935
// If we find that node try to cast the Global Address and the Constant.
17936
SDValue LHS = N->getOperand(0);
17937
SDValue RHS = N->getOperand(1);
17938
17939
if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
17940
std::swap(LHS, RHS);
17941
17942
if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
17943
return SDValue();
17944
17945
// Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
17946
GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
17947
ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(RHS);
17948
17949
// Check that both casts succeeded.
17950
if (!GSDN || !ConstNode)
17951
return SDValue();
17952
17953
int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
17954
SDLoc DL(GSDN);
17955
17956
// The signed int offset needs to fit in 34 bits.
17957
if (!isInt<34>(NewOffset))
17958
return SDValue();
17959
17960
// The new global address is a copy of the old global address except
17961
// that it has the updated Offset.
17962
SDValue GA =
17963
DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
17964
NewOffset, GSDN->getTargetFlags());
17965
SDValue MatPCRel =
17966
DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
17967
return MatPCRel;
17968
}
17969
17970
/// DAG combine for ISD::ADD: tries the ADDZE rewrite first and the
/// MAT_PCREL_ADDR offset fold second. Returns an empty SDValue when neither
/// applies.
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
  if (SDValue AddZE = combineADDToADDZE(N, DCI.DAG, Subtarget))
    return AddZE;

  return combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget);
}
17979
17980
// Detect TRUNCATE operations on bitcasts of float128 values.
17981
// What we are looking for here is the situtation where we extract a subset
17982
// of bits from a 128 bit float.
17983
// This can be of two forms:
17984
// 1) BITCAST of f128 feeding TRUNCATE
17985
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
17986
// The reason this is required is because we do not have a legal i128 type
17987
// and so we want to prevent having to store the f128 and then reload part
17988
// of it.
17989
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
17990
DAGCombinerInfo &DCI) const {
17991
// If we are using CRBits then try that first.
17992
if (Subtarget.useCRBits()) {
17993
// Check if CRBits did anything and return that if it did.
17994
if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
17995
return CRTruncValue;
17996
}
17997
17998
SDLoc dl(N);
17999
SDValue Op0 = N->getOperand(0);
18000
18001
// Looking for a truncate of i128 to i64.
18002
if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
18003
return SDValue();
18004
18005
int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
18006
18007
// SRL feeding TRUNCATE.
18008
if (Op0.getOpcode() == ISD::SRL) {
18009
ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
18010
// The right shift has to be by 64 bits.
18011
if (!ConstNode || ConstNode->getZExtValue() != 64)
18012
return SDValue();
18013
18014
// Switch the element number to extract.
18015
EltToExtract = EltToExtract ? 0 : 1;
18016
// Update Op0 past the SRL.
18017
Op0 = Op0.getOperand(0);
18018
}
18019
18020
// BITCAST feeding a TRUNCATE possibly via SRL.
18021
if (Op0.getOpcode() == ISD::BITCAST &&
18022
Op0.getValueType() == MVT::i128 &&
18023
Op0.getOperand(0).getValueType() == MVT::f128) {
18024
SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
18025
return DCI.DAG.getNode(
18026
ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
18027
DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
18028
}
18029
return SDValue();
18030
}
18031
18032
/// DAG combine for ISD::MUL by a constant (or constant splat) of the form
/// +/-(2^N + 1) or +/-(2^N - 1): replaces the multiply with a shift plus
/// add/sub when that is profitable for the CPU being targeted. Returns an
/// empty SDValue otherwise.
SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  ConstantSDNode *Multiplier = isConstOrConstSplat(N->getOperand(1));
  if (!Multiplier)
    return SDValue();

  // An imul is usually smaller than the alternative sequence for legal type.
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      isOperationLegal(ISD::MUL, N->getValueType(0)))
    return SDValue();

  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
    switch (Subtarget.getCPUDirective()) {
    case PPC::DIR_PWR8:
      //  type        mul     add    shl
      // scalar        4       1      1
      // vector        7       2      2
      return true;
    case PPC::DIR_PWR9:
    case PPC::DIR_PWR10:
    case PPC::DIR_PWR11:
    case PPC::DIR_PWR_FUTURE:
      //  type        mul     add    shl
      // scalar        5       2      2
      // vector        7       2      2

      // The cycle RATIO of related operations are showed as a table above.
      // Because mul is 5(scalar)/7(vector), add/sub/shl are all 2 for both
      // scalar and vector type. For 2 instrs patterns, add/sub + shl
      // are 4, it is always profitable; but for 3 instrs patterns
      // (mul x, -(2^N + 1)) => -(add (shl x, N), x), sub + add + shl are 6.
      // So we should only do it for vector type.
      return !(IsAddOne && IsNeg) || VT.isVector();
    default:
      // TODO: enhance the condition for subtarget before pwr8
      return false;
    }
  };

  const EVT VT = N->getValueType(0);
  SDLoc DL(N);

  const APInt &MulAmt = Multiplier->getAPIntValue();
  const bool IsNeg = MulAmt.isNegative();
  const APInt MulAmtAbs = MulAmt.abs();

  // Classify |multiplier|; the 2^N + 1 form takes precedence.
  const bool IsPow2PlusOne = (MulAmtAbs - 1).isPowerOf2();
  const bool IsPow2MinusOne =
      !IsPow2PlusOne && (MulAmtAbs + 1).isPowerOf2();
  if (!IsPow2PlusOne && !IsPow2MinusOne)
    return SDValue();

  if (!IsProfitable(IsNeg, IsPow2PlusOne, VT))
    return SDValue();

  // Common part of both forms: (shl x, N).
  const APInt Pow2 = IsPow2PlusOne ? MulAmtAbs - 1 : MulAmtAbs + 1;
  SDValue X = N->getOperand(0);
  SDValue ShiftAmt = DAG.getConstant(Pow2.logBase2(), DL, VT);
  SDValue Shifted = DAG.getNode(ISD::SHL, DL, VT, X, ShiftAmt);

  if (IsPow2PlusOne) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
    SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, X, Shifted);
    if (!IsNeg)
      return Sum;
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Sum);
  }

  // (mul x, 2^N - 1) => (sub (shl x, N), x)
  // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
  if (IsNeg)
    return DAG.getNode(ISD::SUB, DL, VT, X, Shifted);
  return DAG.getNode(ISD::SUB, DL, VT, Shifted, X);
}
18117
18118
// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this
18119
// in combiner since we need to check SD flags and other subtarget features.
18120
SDValue PPCTargetLowering::combineFMALike(SDNode *N,
18121
DAGCombinerInfo &DCI) const {
18122
SDValue N0 = N->getOperand(0);
18123
SDValue N1 = N->getOperand(1);
18124
SDValue N2 = N->getOperand(2);
18125
SDNodeFlags Flags = N->getFlags();
18126
EVT VT = N->getValueType(0);
18127
SelectionDAG &DAG = DCI.DAG;
18128
const TargetOptions &Options = getTargetMachine().Options;
18129
unsigned Opc = N->getOpcode();
18130
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
18131
bool LegalOps = !DCI.isBeforeLegalizeOps();
18132
SDLoc Loc(N);
18133
18134
if (!isOperationLegal(ISD::FMA, VT))
18135
return SDValue();
18136
18137
// Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0
18138
// since (fnmsub a b c)=-0 while c-ab=+0.
18139
if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
18140
return SDValue();
18141
18142
// (fma (fneg a) b c) => (fnmsub a b c)
18143
// (fnmsub (fneg a) b c) => (fma a b c)
18144
if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
18145
return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
18146
18147
// (fma a (fneg b) c) => (fnmsub a b c)
18148
// (fnmsub a (fneg b) c) => (fma a b c)
18149
if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
18150
return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
18151
18152
return SDValue();
18153
}
18154
18155
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
18156
// Only duplicate to increase tail-calls for the 64bit SysV ABIs.
18157
if (!Subtarget.is64BitELFABI())
18158
return false;
18159
18160
// If not a tail call then no need to proceed.
18161
if (!CI->isTailCall())
18162
return false;
18163
18164
// If sibling calls have been disabled and tail-calls aren't guaranteed
18165
// there is no reason to duplicate.
18166
auto &TM = getTargetMachine();
18167
if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
18168
return false;
18169
18170
// Can't tail call a function called indirectly, or if it has variadic args.
18171
const Function *Callee = CI->getCalledFunction();
18172
if (!Callee || Callee->isVarArg())
18173
return false;
18174
18175
// Make sure the callee and caller calling conventions are eligible for tco.
18176
const Function *Caller = CI->getParent()->getParent();
18177
if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
18178
CI->getCallingConv()))
18179
return false;
18180
18181
// If the function is local then we have a good chance at tail-calling it
18182
return getTargetMachine().shouldAssumeDSOLocal(Callee);
18183
}
18184
18185
bool PPCTargetLowering::
18186
isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
18187
const Value *Mask = AndI.getOperand(1);
18188
// If the mask is suitable for andi. or andis. we should sink the and.
18189
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
18190
// Can't handle constants wider than 64-bits.
18191
if (CI->getBitWidth() > 64)
18192
return false;
18193
int64_t ConstVal = CI->getZExtValue();
18194
return isUInt<16>(ConstVal) ||
18195
(isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
18196
}
18197
18198
// For non-constant masks, we can always use the record-form and.
18199
return true;
18200
}
18201
18202
/// getAddrModeForFlags - Based on the set of address flags, select the most
18203
/// optimal instruction format to match by.
18204
PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
18205
// This is not a node we should be handling here.
18206
if (Flags == PPC::MOF_None)
18207
return PPC::AM_None;
18208
// Unaligned D-Forms are tried first, followed by the aligned D-Forms.
18209
for (auto FlagSet : AddrModesMap.at(PPC::AM_DForm))
18210
if ((Flags & FlagSet) == FlagSet)
18211
return PPC::AM_DForm;
18212
for (auto FlagSet : AddrModesMap.at(PPC::AM_DSForm))
18213
if ((Flags & FlagSet) == FlagSet)
18214
return PPC::AM_DSForm;
18215
for (auto FlagSet : AddrModesMap.at(PPC::AM_DQForm))
18216
if ((Flags & FlagSet) == FlagSet)
18217
return PPC::AM_DQForm;
18218
for (auto FlagSet : AddrModesMap.at(PPC::AM_PrefixDForm))
18219
if ((Flags & FlagSet) == FlagSet)
18220
return PPC::AM_PrefixDForm;
18221
// If no other forms are selected, return an X-Form as it is the most
18222
// general addressing mode.
18223
return PPC::AM_XForm;
18224
}
18225
18226
/// Set alignment flags based on whether or not the Frame Index is aligned.
18227
/// Utilized when computing flags for address computation when selecting
18228
/// load and store instructions.
18229
static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,
18230
SelectionDAG &DAG) {
18231
bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));
18232
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(IsAdd ? N.getOperand(0) : N);
18233
if (!FI)
18234
return;
18235
const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18236
unsigned FrameIndexAlign = MFI.getObjectAlign(FI->getIndex()).value();
18237
// If this is (add $FI, $S16Imm), the alignment flags are already set
18238
// based on the immediate. We just need to clear the alignment flags
18239
// if the FI alignment is weaker.
18240
if ((FrameIndexAlign % 4) != 0)
18241
FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;
18242
if ((FrameIndexAlign % 16) != 0)
18243
FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;
18244
// If the address is a plain FrameIndex, set alignment flags based on
18245
// FI alignment.
18246
if (!IsAdd) {
18247
if ((FrameIndexAlign % 4) == 0)
18248
FlagSet |= PPC::MOF_RPlusSImm16Mult4;
18249
if ((FrameIndexAlign % 16) == 0)
18250
FlagSet |= PPC::MOF_RPlusSImm16Mult16;
18251
}
18252
}
18253
18254
/// Given a node, compute flags that are used for address computation when
18255
/// selecting load and store instructions. The flags computed are stored in
18256
/// FlagSet. This function takes into account whether the node is a constant,
18257
/// an ADD, OR, or a constant, and computes the address flags accordingly.
18258
static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
18259
SelectionDAG &DAG) {
18260
// Set the alignment flags for the node depending on if the node is
18261
// 4-byte or 16-byte aligned.
18262
auto SetAlignFlagsForImm = [&](uint64_t Imm) {
18263
if ((Imm & 0x3) == 0)
18264
FlagSet |= PPC::MOF_RPlusSImm16Mult4;
18265
if ((Imm & 0xf) == 0)
18266
FlagSet |= PPC::MOF_RPlusSImm16Mult16;
18267
};
18268
18269
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
18270
// All 32-bit constants can be computed as LIS + Disp.
18271
const APInt &ConstImm = CN->getAPIntValue();
18272
if (ConstImm.isSignedIntN(32)) { // Flag to handle 32-bit constants.
18273
FlagSet |= PPC::MOF_AddrIsSImm32;
18274
SetAlignFlagsForImm(ConstImm.getZExtValue());
18275
setAlignFlagsForFI(N, FlagSet, DAG);
18276
}
18277
if (ConstImm.isSignedIntN(34)) // Flag to handle 34-bit constants.
18278
FlagSet |= PPC::MOF_RPlusSImm34;
18279
else // Let constant materialization handle large constants.
18280
FlagSet |= PPC::MOF_NotAddNorCst;
18281
} else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {
18282
// This address can be represented as an addition of:
18283
// - Register + Imm16 (possibly a multiple of 4/16)
18284
// - Register + Imm34
18285
// - Register + PPCISD::Lo
18286
// - Register + Register
18287
// In any case, we won't have to match this as Base + Zero.
18288
SDValue RHS = N.getOperand(1);
18289
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(RHS)) {
18290
const APInt &ConstImm = CN->getAPIntValue();
18291
if (ConstImm.isSignedIntN(16)) {
18292
FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.
18293
SetAlignFlagsForImm(ConstImm.getZExtValue());
18294
setAlignFlagsForFI(N, FlagSet, DAG);
18295
}
18296
if (ConstImm.isSignedIntN(34))
18297
FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
18298
else
18299
FlagSet |= PPC::MOF_RPlusR; // Register.
18300
} else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(1))
18301
FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
18302
else
18303
FlagSet |= PPC::MOF_RPlusR;
18304
} else { // The address computation is not a constant or an addition.
18305
setAlignFlagsForFI(N, FlagSet, DAG);
18306
FlagSet |= PPC::MOF_NotAddNorCst;
18307
}
18308
}
18309
18310
/// Return true if \p N is a PPCISD::MAT_PCREL_ADDR node or is accepted by
/// isValidPCRelNode for a constant pool, global address, jump table or block
/// address node.
static bool isPCRelNode(SDValue N) {
  if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
    return true;
  if (isValidPCRelNode<ConstantPoolSDNode>(N))
    return true;
  if (isValidPCRelNode<GlobalAddressSDNode>(N))
    return true;
  if (isValidPCRelNode<JumpTableSDNode>(N))
    return true;
  return isValidPCRelNode<BlockAddressSDNode>(N);
}
18317
18318
/// computeMOFlags - Given a node N and it's Parent (a MemSDNode), compute
18319
/// the address flags of the load/store instruction that is to be matched.
18320
unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
18321
SelectionDAG &DAG) const {
18322
unsigned FlagSet = PPC::MOF_None;
18323
18324
// Compute subtarget flags.
18325
if (!Subtarget.hasP9Vector())
18326
FlagSet |= PPC::MOF_SubtargetBeforeP9;
18327
else
18328
FlagSet |= PPC::MOF_SubtargetP9;
18329
18330
if (Subtarget.hasPrefixInstrs())
18331
FlagSet |= PPC::MOF_SubtargetP10;
18332
18333
if (Subtarget.hasSPE())
18334
FlagSet |= PPC::MOF_SubtargetSPE;
18335
18336
// Check if we have a PCRel node and return early.
18337
if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
18338
return FlagSet;
18339
18340
// If the node is the paired load/store intrinsics, compute flags for
18341
// address computation and return early.
18342
unsigned ParentOp = Parent->getOpcode();
18343
if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
18344
(ParentOp == ISD::INTRINSIC_VOID))) {
18345
unsigned ID = Parent->getConstantOperandVal(1);
18346
if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
18347
SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
18348
? Parent->getOperand(2)
18349
: Parent->getOperand(3);
18350
computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG);
18351
FlagSet |= PPC::MOF_Vector;
18352
return FlagSet;
18353
}
18354
}
18355
18356
// Mark this as something we don't want to handle here if it is atomic
18357
// or pre-increment instruction.
18358
if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Parent))
18359
if (LSB->isIndexed())
18360
return PPC::MOF_None;
18361
18362
// Compute in-memory type flags. This is based on if there are scalars,
18363
// floats or vectors.
18364
const MemSDNode *MN = dyn_cast<MemSDNode>(Parent);
18365
assert(MN && "Parent should be a MemSDNode!");
18366
EVT MemVT = MN->getMemoryVT();
18367
unsigned Size = MemVT.getSizeInBits();
18368
if (MemVT.isScalarInteger()) {
18369
assert(Size <= 128 &&
18370
"Not expecting scalar integers larger than 16 bytes!");
18371
if (Size < 32)
18372
FlagSet |= PPC::MOF_SubWordInt;
18373
else if (Size == 32)
18374
FlagSet |= PPC::MOF_WordInt;
18375
else
18376
FlagSet |= PPC::MOF_DoubleWordInt;
18377
} else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
18378
if (Size == 128)
18379
FlagSet |= PPC::MOF_Vector;
18380
else if (Size == 256) {
18381
assert(Subtarget.pairedVectorMemops() &&
18382
"256-bit vectors are only available when paired vector memops is "
18383
"enabled!");
18384
FlagSet |= PPC::MOF_Vector;
18385
} else
18386
llvm_unreachable("Not expecting illegal vectors!");
18387
} else { // Floating point type: can be scalar, f128 or vector types.
18388
if (Size == 32 || Size == 64)
18389
FlagSet |= PPC::MOF_ScalarFloat;
18390
else if (MemVT == MVT::f128 || MemVT.isVector())
18391
FlagSet |= PPC::MOF_Vector;
18392
else
18393
llvm_unreachable("Not expecting illegal scalar floats!");
18394
}
18395
18396
// Compute flags for address computation.
18397
computeFlagsForAddressComputation(N, FlagSet, DAG);
18398
18399
// Compute type extension flags.
18400
if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Parent)) {
18401
switch (LN->getExtensionType()) {
18402
case ISD::SEXTLOAD:
18403
FlagSet |= PPC::MOF_SExt;
18404
break;
18405
case ISD::EXTLOAD:
18406
case ISD::ZEXTLOAD:
18407
FlagSet |= PPC::MOF_ZExt;
18408
break;
18409
case ISD::NON_EXTLOAD:
18410
FlagSet |= PPC::MOF_NoExt;
18411
break;
18412
}
18413
} else
18414
FlagSet |= PPC::MOF_NoExt;
18415
18416
// For integers, no extension is the same as zero extension.
18417
// We set the extension mode to zero extension so we don't have
18418
// to add separate entries in AddrModesMap for loads and stores.
18419
if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
18420
FlagSet |= PPC::MOF_ZExt;
18421
FlagSet &= ~PPC::MOF_NoExt;
18422
}
18423
18424
// If we don't have prefixed instructions, 34-bit constants should be
18425
// treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
18426
bool IsNonP1034BitConst =
18427
((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
18428
FlagSet) == PPC::MOF_RPlusSImm34;
18429
if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
18430
IsNonP1034BitConst)
18431
FlagSet |= PPC::MOF_NotAddNorCst;
18432
18433
return FlagSet;
18434
}
18435
18436
/// SelectForceXFormMode - Given the specified address, force it to be
18437
/// represented as an indexed [r+r] operation (an XForm instruction).
18438
PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
18439
SDValue &Base,
18440
SelectionDAG &DAG) const {
18441
18442
PPC::AddrMode Mode = PPC::AM_XForm;
18443
int16_t ForceXFormImm = 0;
18444
if (provablyDisjointOr(DAG, N) &&
18445
!isIntS16Immediate(N.getOperand(1), ForceXFormImm)) {
18446
Disp = N.getOperand(0);
18447
Base = N.getOperand(1);
18448
return Mode;
18449
}
18450
18451
// If the address is the result of an add, we will utilize the fact that the
18452
// address calculation includes an implicit add. However, we can reduce
18453
// register pressure if we do not materialize a constant just for use as the
18454
// index register. We only get rid of the add if it is not an add of a
18455
// value and a 16-bit signed constant and both have a single use.
18456
if (N.getOpcode() == ISD::ADD &&
18457
(!isIntS16Immediate(N.getOperand(1), ForceXFormImm) ||
18458
!N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
18459
Disp = N.getOperand(0);
18460
Base = N.getOperand(1);
18461
return Mode;
18462
}
18463
18464
// Otherwise, use R0 as the base register.
18465
Disp = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
18466
N.getValueType());
18467
Base = N;
18468
18469
return Mode;
18470
}
18471
18472
bool PPCTargetLowering::splitValueIntoRegisterParts(
18473
SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
18474
unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
18475
EVT ValVT = Val.getValueType();
18476
// If we are splitting a scalar integer into f64 parts (i.e. so they
18477
// can be placed into VFRC registers), we need to zero extend and
18478
// bitcast the values. This will ensure the value is placed into a
18479
// VSR using direct moves or stack operations as needed.
18480
if (PartVT == MVT::f64 &&
18481
(ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) {
18482
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
18483
Val = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Val);
18484
Parts[0] = Val;
18485
return true;
18486
}
18487
return false;
18488
}
18489
18490
/// Replace \p Op with a call to the external symbol \p LibCallName, passing
/// every operand of \p Op as an argument and using the value type of \p Op as
/// the return type. The call is emitted as a tail call when \p Op is in tail
/// call position and the return types are compatible.
///
/// \returns the first element of the pair produced by LowerCallTo.
SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
                                          SelectionDAG &DAG) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  LLVMContext &Ctx = *DAG.getContext();

  TargetLowering::CallLoweringInfo CLI(DAG);
  const EVT RetVT = Op.getValueType();
  Type *RetTy = RetVT.getTypeForEVT(Ctx);
  SDValue Callee =
      DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout()));
  const bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, false);

  // Build one argument entry per operand of Op.
  TargetLowering::ArgListTy Args;
  for (const SDValue &Operand : Op->op_values()) {
    const EVT ArgVT = Operand.getValueType();
    TargetLowering::ArgListEntry Arg;
    Arg.Node = Operand;
    Arg.Ty = ArgVT.getTypeForEVT(Ctx);
    Arg.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, SignExtend);
    Arg.IsZExt = !Arg.IsSExt;
    Args.push_back(Arg);
  }

  // isInTailCallPosition updates TailChain; it is only adopted as the call's
  // chain when the call really becomes a tail call.
  SDValue Chain = DAG.getEntryNode();
  SDValue TailChain = Chain;
  const Function &Fn = DAG.getMachineFunction().getFunction();
  const bool IsTailCall =
      TLI.isInTailCallPosition(DAG, Op.getNode(), TailChain) &&
      (RetTy == Fn.getReturnType() || Fn.getReturnType()->isVoidTy());
  if (IsTailCall)
    Chain = TailChain;

  CLI.setDebugLoc(SDLoc(Op))
      .setChain(Chain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
      .setTailCall(IsTailCall)
      .setSExtResult(SignExtend)
      .setZExtResult(!SignExtend)
      .setIsPostTypeLegalization(true);
  return TLI.LowerCallTo(CLI).first;
}
18528
18529
/// Lower \p Op to \p LibCallFloatName when its value type is f32 and to
/// \p LibCallDoubleName when it is f64.
///
/// \returns the lowered call, or an empty SDValue for any other type.
SDValue PPCTargetLowering::lowerLibCallBasedOnType(
    const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op,
    SelectionDAG &DAG) const {
  const EVT VT = Op.getValueType();

  const char *LibCallName = nullptr;
  if (VT == MVT::f32)
    LibCallName = LibCallFloatName;
  else if (VT == MVT::f64)
    LibCallName = LibCallDoubleName;
  else
    return SDValue();

  return lowerToLibCall(LibCallName, Op, DAG);
}
18540
18541
/// Return true if \p Op passes isLowringToMASSSafe and its node additionally
/// carries the no-signed-zeros, no-NaNs and no-infs flags.
bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const {
  if (!isLowringToMASSSafe(Op))
    return false;

  const SDNodeFlags NodeFlags = Op.getNode()->getFlags();
  if (!NodeFlags.hasNoSignedZeros())
    return false;
  return NodeFlags.hasNoNaNs() && NodeFlags.hasNoInfs();
}
18546
18547
/// Return true if the node of \p Op carries the approximate-functions flag.
bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
  const SDNodeFlags NodeFlags = Op.getNode()->getFlags();
  return NodeFlags.hasApproximateFuncs();
}
18550
18551
bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
18552
return getTargetMachine().Options.PPCGenScalarMASSEntries;
18553
}
18554
18555
/// Common driver for the lower* math helpers. Picks between the regular and
/// the "finite" entry points and defers the f32/f64 choice to
/// lowerLibCallBasedOnType.
///
/// \returns an empty SDValue when scalar MASS conversion is disabled or
/// \p Op fails isLowringToMASSSafe; otherwise the result of
/// lowerLibCallBasedOnType.
SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
                                            const char *LibCallFloatName,
                                            const char *LibCallDoubleNameFinite,
                                            const char *LibCallFloatNameFinite,
                                            SDValue Op,
                                            SelectionDAG &DAG) const {
  if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
    return SDValue();

  // The finite variants are only used when isLowringToMASSFiniteSafe holds.
  const bool UseFinite = isLowringToMASSFiniteSafe(Op);
  const char *FloatName = UseFinite ? LibCallFloatNameFinite : LibCallFloatName;
  const char *DoubleName =
      UseFinite ? LibCallDoubleNameFinite : LibCallDoubleName;
  return lowerLibCallBasedOnType(FloatName, DoubleName, Op, DAG);
}
18571
18572
/// Lower \p Op through lowerLibCallBase using the __xl_pow entry points.
SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(/*LibCallDoubleName=*/"__xl_pow",
                          /*LibCallFloatName=*/"__xl_powf",
                          /*LibCallDoubleNameFinite=*/"__xl_pow_finite",
                          /*LibCallFloatNameFinite=*/"__xl_powf_finite", Op,
                          DAG);
}
18576
18577
/// Lower \p Op through lowerLibCallBase using the __xl_sin entry points.
SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(/*LibCallDoubleName=*/"__xl_sin",
                          /*LibCallFloatName=*/"__xl_sinf",
                          /*LibCallDoubleNameFinite=*/"__xl_sin_finite",
                          /*LibCallFloatNameFinite=*/"__xl_sinf_finite", Op,
                          DAG);
}
18581
18582
/// Lower \p Op through lowerLibCallBase using the __xl_cos entry points.
SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(/*LibCallDoubleName=*/"__xl_cos",
                          /*LibCallFloatName=*/"__xl_cosf",
                          /*LibCallDoubleNameFinite=*/"__xl_cos_finite",
                          /*LibCallFloatNameFinite=*/"__xl_cosf_finite", Op,
                          DAG);
}
18586
18587
/// Lower \p Op through lowerLibCallBase using the __xl_log entry points.
SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(/*LibCallDoubleName=*/"__xl_log",
                          /*LibCallFloatName=*/"__xl_logf",
                          /*LibCallDoubleNameFinite=*/"__xl_log_finite",
                          /*LibCallFloatNameFinite=*/"__xl_logf_finite", Op,
                          DAG);
}
18591
18592
/// Lower \p Op through lowerLibCallBase using the __xl_log10 entry points.
SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(/*LibCallDoubleName=*/"__xl_log10",
                          /*LibCallFloatName=*/"__xl_log10f",
                          /*LibCallDoubleNameFinite=*/"__xl_log10_finite",
                          /*LibCallFloatNameFinite=*/"__xl_log10f_finite", Op,
                          DAG);
}
18596
18597
/// Lower \p Op through lowerLibCallBase using the __xl_exp entry points.
SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase(/*LibCallDoubleName=*/"__xl_exp",
                          /*LibCallFloatName=*/"__xl_expf",
                          /*LibCallDoubleNameFinite=*/"__xl_exp_finite",
                          /*LibCallFloatNameFinite=*/"__xl_expf_finite", Op,
                          DAG);
}
18601
18602
// If we happen to match to an aligned D-Form, check if the Frame Index is
18603
// adequately aligned. If it is not, reset the mode to match to X-Form.
18604
static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
18605
PPC::AddrMode &Mode) {
18606
if (!isa<FrameIndexSDNode>(N))
18607
return;
18608
if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
18609
(Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
18610
Mode = PPC::AM_XForm;
18611
}
18612
18613
/// SelectOptimalAddrMode - Based on a node N and it's Parent (a MemSDNode),
18614
/// compute the address flags of the node, get the optimal address mode based
18615
/// on the flags, and set the Base and Disp based on the address mode.
18616
PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
18617
SDValue N, SDValue &Disp,
18618
SDValue &Base,
18619
SelectionDAG &DAG,
18620
MaybeAlign Align) const {
18621
SDLoc DL(Parent);
18622
18623
// Compute the address flags.
18624
unsigned Flags = computeMOFlags(Parent, N, DAG);
18625
18626
// Get the optimal address mode based on the Flags.
18627
PPC::AddrMode Mode = getAddrModeForFlags(Flags);
18628
18629
// If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
18630
// Select an X-Form load if it is not.
18631
setXFormForUnalignedFI(N, Flags, Mode);
18632
18633
// Set the mode to PC-Relative addressing mode if we have a valid PC-Rel node.
18634
if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
18635
assert(Subtarget.isUsingPCRelativeCalls() &&
18636
"Must be using PC-Relative calls when a valid PC-Relative node is "
18637
"present!");
18638
Mode = PPC::AM_PCRel;
18639
}
18640
18641
// Set Base and Disp accordingly depending on the address mode.
18642
switch (Mode) {
18643
case PPC::AM_DForm:
18644
case PPC::AM_DSForm:
18645
case PPC::AM_DQForm: {
18646
// This is a register plus a 16-bit immediate. The base will be the
18647
// register and the displacement will be the immediate unless it
18648
// isn't sufficiently aligned.
18649
if (Flags & PPC::MOF_RPlusSImm16) {
18650
SDValue Op0 = N.getOperand(0);
18651
SDValue Op1 = N.getOperand(1);
18652
int16_t Imm = Op1->getAsZExtVal();
18653
if (!Align || isAligned(*Align, Imm)) {
18654
Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());
18655
Base = Op0;
18656
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op0)) {
18657
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
18658
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
18659
}
18660
break;
18661
}
18662
}
18663
// This is a register plus the @lo relocation. The base is the register
18664
// and the displacement is the global address.
18665
else if (Flags & PPC::MOF_RPlusLo) {
18666
Disp = N.getOperand(1).getOperand(0); // The global address.
18667
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
18668
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
18669
Disp.getOpcode() == ISD::TargetConstantPool ||
18670
Disp.getOpcode() == ISD::TargetJumpTable);
18671
Base = N.getOperand(0);
18672
break;
18673
}
18674
// This is a constant address at most 32 bits. The base will be
18675
// zero or load-immediate-shifted and the displacement will be
18676
// the low 16 bits of the address.
18677
else if (Flags & PPC::MOF_AddrIsSImm32) {
18678
auto *CN = cast<ConstantSDNode>(N);
18679
EVT CNType = CN->getValueType(0);
18680
uint64_t CNImm = CN->getZExtValue();
18681
// If this address fits entirely in a 16-bit sext immediate field, codegen
18682
// this as "d, 0".
18683
int16_t Imm;
18684
if (isIntS16Immediate(CN, Imm) && (!Align || isAligned(*Align, Imm))) {
18685
Disp = DAG.getTargetConstant(Imm, DL, CNType);
18686
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
18687
CNType);
18688
break;
18689
}
18690
// Handle 32-bit sext immediate with LIS + Addr mode.
18691
if ((CNType == MVT::i32 || isInt<32>(CNImm)) &&
18692
(!Align || isAligned(*Align, CNImm))) {
18693
int32_t Addr = (int32_t)CNImm;
18694
// Otherwise, break this down into LIS + Disp.
18695
Disp = DAG.getTargetConstant((int16_t)Addr, DL, MVT::i32);
18696
Base =
18697
DAG.getTargetConstant((Addr - (int16_t)Addr) >> 16, DL, MVT::i32);
18698
uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
18699
Base = SDValue(DAG.getMachineNode(LIS, DL, CNType, Base), 0);
18700
break;
18701
}
18702
}
18703
// Otherwise, the PPC:MOF_NotAdd flag is set. Load/Store is Non-foldable.
18704
Disp = DAG.getTargetConstant(0, DL, getPointerTy(DAG.getDataLayout()));
18705
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
18706
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
18707
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
18708
} else
18709
Base = N;
18710
break;
18711
}
18712
case PPC::AM_PrefixDForm: {
18713
int64_t Imm34 = 0;
18714
unsigned Opcode = N.getOpcode();
18715
if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
18716
(isIntS34Immediate(N.getOperand(1), Imm34))) {
18717
// N is an Add/OR Node, and it's operand is a 34-bit signed immediate.
18718
Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
18719
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
18720
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
18721
else
18722
Base = N.getOperand(0);
18723
} else if (isIntS34Immediate(N, Imm34)) {
18724
// The address is a 34-bit signed immediate.
18725
Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
18726
Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
18727
}
18728
break;
18729
}
18730
case PPC::AM_PCRel: {
18731
// When selecting PC-Relative instructions, "Base" is not utilized as
18732
// we select the address as [PC+imm].
18733
Disp = N;
18734
break;
18735
}
18736
case PPC::AM_None:
18737
break;
18738
default: { // By default, X-Form is always available to be selected.
18739
// When a frame index is not aligned, we also match by XForm.
18740
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N);
18741
Base = FI ? N : N.getOperand(1);
18742
Disp = FI ? DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
18743
N.getValueType())
18744
: N.getOperand(0);
18745
break;
18746
}
18747
}
18748
return Mode;
18749
}
18750
18751
/// Pick the calling-convention assignment function for \p CC.
/// RetCC_PPC_Cold is used only for return values of CallingConv::Cold;
/// everything else uses CC_PPC64_ELF. \p IsVarArg is not consulted.
CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
                                                 bool Return,
                                                 bool IsVarArg) const {
  if (CC == CallingConv::Cold && Return)
    return RetCC_PPC_Cold;
  return CC_PPC64_ELF;
}
18761
18762
bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
18763
return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics();
18764
}
18765
18766
/// Choose how an atomicrmw instruction is expanded in IR.
///
/// \returns MaskedIntrinsic for 128-bit operations when quadword atomics are
/// inlined, CmpXChg for UIncWrap/UDecWrap, and the TargetLowering default for
/// every other operation.
TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  const unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  if (shouldInlineQuadwordAtomics() && Size == 128)
    return AtomicExpansionKind::MaskedIntrinsic;

  // Every path through this switch returns, so nothing may follow it.
  switch (AI->getOperation()) {
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    return AtomicExpansionKind::CmpXChg;
  default:
    return TargetLowering::shouldExpandAtomicRMWInIR(AI);
  }
}
18782
18783
/// Choose how a cmpxchg instruction is expanded in IR: MaskedIntrinsic for a
/// 128-bit new value when quadword atomics are inlined, otherwise the
/// TargetLowering default.
TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  Type *NewValTy = AI->getNewValOperand()->getType();
  const unsigned Bits = NewValTy->getPrimitiveSizeInBits();

  if (!(shouldInlineQuadwordAtomics() && Bits == 128))
    return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);

  return AtomicExpansionKind::MaskedIntrinsic;
}
18790
18791
/// Map an atomicrmw binary operation to the matching ppc_atomicrmw_*_i128
/// intrinsic. Operations without a mapping are treated as unreachable.
static Intrinsic::ID
getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
  switch (BinOp) {
  case AtomicRMWInst::Xchg:
    return Intrinsic::ppc_atomicrmw_xchg_i128;
  case AtomicRMWInst::Add:
    return Intrinsic::ppc_atomicrmw_add_i128;
  case AtomicRMWInst::Sub:
    return Intrinsic::ppc_atomicrmw_sub_i128;
  case AtomicRMWInst::And:
    return Intrinsic::ppc_atomicrmw_and_i128;
  case AtomicRMWInst::Or:
    return Intrinsic::ppc_atomicrmw_or_i128;
  case AtomicRMWInst::Xor:
    return Intrinsic::ppc_atomicrmw_xor_i128;
  case AtomicRMWInst::Nand:
    return Intrinsic::ppc_atomicrmw_nand_i128;
  default:
    llvm_unreachable("Unexpected AtomicRMW BinOp");
  }
}
18812
18813
/// Emit a 128-bit atomicrmw as a call to the i128 intrinsic selected by
/// getIntrinsicForAtomicRMWBinOp128. \p Incr is split into two 64-bit halves
/// for the call and the two returned halves are recombined into one value of
/// the type of \p Incr. \p Mask, \p ShiftAmt and \p Ord are not used.
Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
    IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
    Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
  Type *WideTy = Incr->getType();
  assert(WideTy->getPrimitiveSizeInBits() == 128);
  Function *RMWFn = Intrinsic::getDeclaration(
      Mod, getIntrinsicForAtomicRMWBinOp128(AI->getOperation()));
  Type *HalfTy = Type::getInt64Ty(Mod->getContext());

  // Split the operand into its low and high 64 bits.
  Value *IncrLo = Builder.CreateTrunc(Incr, HalfTy, "incr_lo");
  Value *IncrHiBits = Builder.CreateLShr(Incr, 64);
  Value *IncrHi = Builder.CreateTrunc(IncrHiBits, HalfTy, "incr_hi");

  Value *Result = Builder.CreateCall(RMWFn, {AlignedAddr, IncrLo, IncrHi});

  // Reassemble the {lo, hi} result into a single 128-bit value.
  Value *ResLo = Builder.CreateExtractValue(Result, 0, "lo");
  Value *ResHi = Builder.CreateExtractValue(Result, 1, "hi");
  Value *WideLo = Builder.CreateZExt(ResLo, WideTy, "lo64");
  Value *WideHi = Builder.CreateZExt(ResHi, WideTy, "hi64");
  Value *ShiftedHi = Builder.CreateShl(WideHi, ConstantInt::get(WideTy, 64));
  return Builder.CreateOr(WideLo, ShiftedHi, "val64");
}
18834
18835
/// Emit a 128-bit cmpxchg as a call to the ppc_cmpxchg_i128 intrinsic,
/// bracketed by emitLeadingFence and emitTrailingFence. \p CmpVal and
/// \p NewVal are each split into two 64-bit halves for the call and the two
/// returned halves are recombined into one value of the type of \p CmpVal.
/// \p Mask is not used.
Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
    IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
  Type *WideTy = CmpVal->getType();
  assert(WideTy->getPrimitiveSizeInBits() == 128);
  Function *CmpXchgFn =
      Intrinsic::getDeclaration(Mod, Intrinsic::ppc_cmpxchg_i128);
  Type *HalfTy = Type::getInt64Ty(Mod->getContext());

  // Split the expected value into its low and high 64 bits.
  Value *CmpLo = Builder.CreateTrunc(CmpVal, HalfTy, "cmp_lo");
  Value *CmpHiBits = Builder.CreateLShr(CmpVal, 64);
  Value *CmpHi = Builder.CreateTrunc(CmpHiBits, HalfTy, "cmp_hi");

  // Split the replacement value the same way.
  Value *NewLo = Builder.CreateTrunc(NewVal, HalfTy, "new_lo");
  Value *NewHiBits = Builder.CreateLShr(NewVal, 64);
  Value *NewHi = Builder.CreateTrunc(NewHiBits, HalfTy, "new_hi");

  emitLeadingFence(Builder, CI, Ord);
  Value *Result = Builder.CreateCall(
      CmpXchgFn, {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
  emitTrailingFence(Builder, CI, Ord);

  // Reassemble the {lo, hi} result into a single 128-bit value.
  Value *ResLo = Builder.CreateExtractValue(Result, 0, "lo");
  Value *ResHi = Builder.CreateExtractValue(Result, 1, "hi");
  Value *WideLo = Builder.CreateZExt(ResLo, WideTy, "lo64");
  Value *WideHi = Builder.CreateZExt(ResHi, WideTy, "hi64");
  Value *ShiftedHi = Builder.CreateShl(WideHi, ConstantInt::get(WideTy, 64));
  return Builder.CreateOr(WideLo, ShiftedHi, "val64");
}
18862
18863