GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file defines a DAG pattern matching instruction selector for X86,
10
// converting from a legalized dag to an X86 dag.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "X86ISelDAGToDAG.h"
15
#include "X86.h"
16
#include "X86MachineFunctionInfo.h"
17
#include "X86RegisterInfo.h"
18
#include "X86Subtarget.h"
19
#include "X86TargetMachine.h"
20
#include "llvm/ADT/Statistic.h"
21
#include "llvm/CodeGen/MachineModuleInfo.h"
22
#include "llvm/CodeGen/SelectionDAGISel.h"
23
#include "llvm/Config/llvm-config.h"
24
#include "llvm/IR/ConstantRange.h"
25
#include "llvm/IR/Function.h"
26
#include "llvm/IR/Instructions.h"
27
#include "llvm/IR/Intrinsics.h"
28
#include "llvm/IR/IntrinsicsX86.h"
29
#include "llvm/IR/Module.h"
30
#include "llvm/IR/Type.h"
31
#include "llvm/Support/Debug.h"
32
#include "llvm/Support/ErrorHandling.h"
33
#include "llvm/Support/KnownBits.h"
34
#include "llvm/Support/MathExtras.h"
35
#include <cstdint>
36
37
using namespace llvm;
38
39
#define DEBUG_TYPE "x86-isel"
40
#define PASS_NAME "X86 DAG->DAG Instruction Selection"
41
42
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43
44
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
45
cl::desc("Enable setting constant bits to reduce size of mask immediates"),
46
cl::Hidden);
47
48
static cl::opt<bool> EnablePromoteAnyextLoad(
49
"x86-promote-anyext-load", cl::init(true),
50
cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
51
52
extern cl::opt<bool> IndirectBranchTracking;
53
54
//===----------------------------------------------------------------------===//
55
// Pattern Matcher Implementation
56
//===----------------------------------------------------------------------===//
57
58
namespace {
59
/// This corresponds to X86AddressMode, but uses SDValue's instead of register
60
/// numbers for the leaves of the matched tree.
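/// An X86 memory operand has the general form
///   Segment : [ Base + Scale * Index + Disp ]
/// and the fields below track each component symbolically (as SDValues or
/// target symbols) while an address is being matched.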
61
struct X86ISelAddressMode {
62
enum {
63
RegBase,
64
FrameIndexBase
65
} BaseType = RegBase;
66
67
// This is really a union, discriminated by BaseType!
68
SDValue Base_Reg;
69
int Base_FrameIndex = 0;
70
71
unsigned Scale = 1;
72
SDValue IndexReg;
73
int32_t Disp = 0;
74
SDValue Segment;
75
const GlobalValue *GV = nullptr;
76
const Constant *CP = nullptr;
77
const BlockAddress *BlockAddr = nullptr;
78
const char *ES = nullptr;
79
MCSymbol *MCSym = nullptr;
80
int JT = -1;
81
Align Alignment; // CP alignment.
82
unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
83
bool NegateIndex = false;
84
85
X86ISelAddressMode() = default;
86
87
bool hasSymbolicDisplacement() const {
88
return GV != nullptr || CP != nullptr || ES != nullptr ||
89
MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
90
}
91
92
bool hasBaseOrIndexReg() const {
93
return BaseType == FrameIndexBase ||
94
IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
95
}
96
97
/// Return true if this addressing mode is already RIP-relative.
98
bool isRIPRelative() const {
99
if (BaseType != RegBase) return false;
100
if (RegisterSDNode *RegNode =
101
dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
102
return RegNode->getReg() == X86::RIP;
103
return false;
104
}
105
106
void setBaseReg(SDValue Reg) {
107
BaseType = RegBase;
108
Base_Reg = Reg;
109
}
110
111
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
112
void dump(SelectionDAG *DAG = nullptr) {
113
dbgs() << "X86ISelAddressMode " << this << '\n';
114
dbgs() << "Base_Reg ";
115
if (Base_Reg.getNode())
116
Base_Reg.getNode()->dump(DAG);
117
else
118
dbgs() << "nul\n";
119
if (BaseType == FrameIndexBase)
120
dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
121
dbgs() << " Scale " << Scale << '\n'
122
<< "IndexReg ";
123
if (NegateIndex)
124
dbgs() << "negate ";
125
if (IndexReg.getNode())
126
IndexReg.getNode()->dump(DAG);
127
else
128
dbgs() << "nul\n";
129
dbgs() << " Disp " << Disp << '\n'
130
<< "GV ";
131
if (GV)
132
GV->dump();
133
else
134
dbgs() << "nul";
135
dbgs() << " CP ";
136
if (CP)
137
CP->dump();
138
else
139
dbgs() << "nul";
140
dbgs() << '\n'
141
<< "ES ";
142
if (ES)
143
dbgs() << ES;
144
else
145
dbgs() << "nul";
146
dbgs() << " MCSym ";
147
if (MCSym)
148
dbgs() << MCSym;
149
else
150
dbgs() << "nul";
151
dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
152
}
153
#endif
154
};
155
}
156
157
namespace {
158
//===--------------------------------------------------------------------===//
159
/// ISel - X86-specific code to select X86 machine instructions for
160
/// SelectionDAG operations.
161
///
162
class X86DAGToDAGISel final : public SelectionDAGISel {
163
/// Keep a pointer to the X86Subtarget around so that we can
164
/// make the right decision when generating code for different targets.
165
const X86Subtarget *Subtarget;
166
167
/// If true, selector should try to optimize for minimum code size.
168
bool OptForMinSize;
169
170
/// Disable direct TLS access through segment registers.
171
bool IndirectTlsSegRefs;
172
173
public:
174
X86DAGToDAGISel() = delete;
175
176
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177
: SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
178
OptForMinSize(false), IndirectTlsSegRefs(false) {}
179
180
bool runOnMachineFunction(MachineFunction &MF) override {
181
// Reset the subtarget each time through.
182
Subtarget = &MF.getSubtarget<X86Subtarget>();
183
IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184
"indirect-tls-seg-refs");
185
186
// OptFor[Min]Size are used in pattern predicates that isel is matching.
187
OptForMinSize = MF.getFunction().hasMinSize();
188
assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189
"OptForMinSize implies OptForSize");
190
return SelectionDAGISel::runOnMachineFunction(MF);
191
}
192
193
void emitFunctionEntryCode() override;
194
195
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
196
197
void PreprocessISelDAG() override;
198
void PostprocessISelDAG() override;
199
200
// Include the pieces autogenerated from the target description.
201
#include "X86GenDAGISel.inc"
202
203
private:
204
void Select(SDNode *N) override;
205
206
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
207
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
208
bool AllowSegmentRegForX32 = false);
209
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
210
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
211
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
212
bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
213
SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
214
unsigned Depth);
215
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
216
unsigned Depth);
217
bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218
unsigned Depth);
219
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220
bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221
SDValue &Scale, SDValue &Index, SDValue &Disp,
222
SDValue &Segment);
223
bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
224
SDValue ScaleOp, SDValue &Base, SDValue &Scale,
225
SDValue &Index, SDValue &Disp, SDValue &Segment);
226
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227
bool selectLEAAddr(SDValue N, SDValue &Base,
228
SDValue &Scale, SDValue &Index, SDValue &Disp,
229
SDValue &Segment);
230
bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231
SDValue &Scale, SDValue &Index, SDValue &Disp,
232
SDValue &Segment);
233
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234
SDValue &Scale, SDValue &Index, SDValue &Disp,
235
SDValue &Segment);
236
bool selectRelocImm(SDValue N, SDValue &Op);
237
238
bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
239
SDValue &Base, SDValue &Scale,
240
SDValue &Index, SDValue &Disp,
241
SDValue &Segment);
242
243
// Convenience method where P is also root.
244
bool tryFoldLoad(SDNode *P, SDValue N,
245
SDValue &Base, SDValue &Scale,
246
SDValue &Index, SDValue &Disp,
247
SDValue &Segment) {
248
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
249
}
250
251
bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
252
SDValue &Base, SDValue &Scale,
253
SDValue &Index, SDValue &Disp,
254
SDValue &Segment);
255
256
bool isProfitableToFormMaskedOp(SDNode *N) const;
257
258
/// Implement addressing mode selection for inline asm expressions.
259
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
260
InlineAsm::ConstraintCode ConstraintID,
261
std::vector<SDValue> &OutOps) override;
262
263
void emitSpecialCodeForMain();
264
265
inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
266
MVT VT, SDValue &Base, SDValue &Scale,
267
SDValue &Index, SDValue &Disp,
268
SDValue &Segment) {
269
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
270
Base = CurDAG->getTargetFrameIndex(
271
AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
272
else if (AM.Base_Reg.getNode())
273
Base = AM.Base_Reg;
274
else
275
Base = CurDAG->getRegister(0, VT);
276
277
Scale = getI8Imm(AM.Scale, DL);
278
279
#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
280
// Negate the index if needed.
281
if (AM.NegateIndex) {
282
unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
283
: GET_ND_IF_ENABLED(X86::NEG32r);
284
SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
285
AM.IndexReg), 0);
286
AM.IndexReg = Neg;
287
}
288
289
if (AM.IndexReg.getNode())
290
Index = AM.IndexReg;
291
else
292
Index = CurDAG->getRegister(0, VT);
293
294
// These are 32-bit even in 64-bit mode since RIP-relative offset
295
// is 32-bit.
296
if (AM.GV)
297
Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
298
MVT::i32, AM.Disp,
299
AM.SymbolFlags);
300
else if (AM.CP)
301
Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
302
AM.Disp, AM.SymbolFlags);
303
else if (AM.ES) {
304
assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
305
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
306
} else if (AM.MCSym) {
307
assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
308
assert(AM.SymbolFlags == 0 && "oo");
309
Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
310
} else if (AM.JT != -1) {
311
assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
312
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
313
} else if (AM.BlockAddr)
314
Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
315
AM.SymbolFlags);
316
else
317
Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
318
319
if (AM.Segment.getNode())
320
Segment = AM.Segment;
321
else
322
Segment = CurDAG->getRegister(0, MVT::i16);
323
}
324
325
// Utility function to determine whether we should avoid selecting
326
// immediate forms of instructions for better code size or not.
327
// At a high level, we'd like to avoid such instructions when
328
// we have similar constants used within the same basic block
329
// that can be kept in a register.
330
//
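// For example, under optsize, if the same 32-bit constant is stored to two
// locations in one block, materializing it once in a register and storing
// the register twice can be smaller than encoding the 4-byte immediate in
// each store.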
331
bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
332
uint32_t UseCount = 0;
333
334
// Do not want to hoist if we're not optimizing for size.
335
// TODO: We'd like to remove this restriction.
336
// See the comment in X86InstrInfo.td for more info.
337
if (!CurDAG->shouldOptForSize())
338
return false;
339
340
// Walk all the users of the immediate.
341
for (const SDNode *User : N->uses()) {
342
if (UseCount >= 2)
343
break;
344
345
// This user is already selected. Count it as a legitimate use and
346
// move on.
347
if (User->isMachineOpcode()) {
348
UseCount++;
349
continue;
350
}
351
352
// We want to count stores of immediates as real uses.
353
if (User->getOpcode() == ISD::STORE &&
354
User->getOperand(1).getNode() == N) {
355
UseCount++;
356
continue;
357
}
358
359
// We don't currently match users that have > 2 operands (except
// for stores, which are handled above).
// Those instructions won't match in ISEL, for now, and would
// be counted incorrectly.
// This may change in the future as we add additional instruction
// types.
365
if (User->getNumOperands() != 2)
366
continue;
367
368
// If this is a sign-extended 8-bit integer immediate used in an ALU
369
// instruction, there is probably an opcode encoding to save space.
370
auto *C = dyn_cast<ConstantSDNode>(N);
371
if (C && isInt<8>(C->getSExtValue()))
372
continue;
373
374
// Immediates that are used for offsets as part of stack
375
// manipulation should be left alone. These are typically
376
// used to indicate SP offsets for argument passing and
377
// will get pulled into stores/pushes (implicitly).
378
if (User->getOpcode() == X86ISD::ADD ||
379
User->getOpcode() == ISD::ADD ||
380
User->getOpcode() == X86ISD::SUB ||
381
User->getOpcode() == ISD::SUB) {
382
383
// Find the other operand of the add/sub.
384
SDValue OtherOp = User->getOperand(0);
385
if (OtherOp.getNode() == N)
386
OtherOp = User->getOperand(1);
387
388
// Don't count if the other operand is SP.
389
RegisterSDNode *RegNode;
390
if (OtherOp->getOpcode() == ISD::CopyFromReg &&
391
(RegNode = dyn_cast_or_null<RegisterSDNode>(
392
OtherOp->getOperand(1).getNode())))
393
if ((RegNode->getReg() == X86::ESP) ||
394
(RegNode->getReg() == X86::RSP))
395
continue;
396
}
397
398
// ... otherwise, count this and move on.
399
UseCount++;
400
}
401
402
// If we have more than 1 use, then recommend for hoisting.
403
return (UseCount > 1);
404
}
405
406
/// Return a target constant with the specified value of type i8.
407
inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
408
return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
409
}
410
411
/// Return a target constant with the specified value, of type i32.
412
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
413
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
414
}
415
416
/// Return a target constant with the specified value, of type i64.
417
inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
418
return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
419
}
420
421
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
422
const SDLoc &DL) {
423
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
424
uint64_t Index = N->getConstantOperandVal(1);
425
MVT VecVT = N->getOperand(0).getSimpleValueType();
426
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
427
}
428
429
SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
430
const SDLoc &DL) {
431
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
432
uint64_t Index = N->getConstantOperandVal(2);
433
MVT VecVT = N->getSimpleValueType(0);
434
return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
435
}
436
437
SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
438
const SDLoc &DL) {
439
assert(VecWidth == 128 && "Unexpected vector width");
440
uint64_t Index = N->getConstantOperandVal(2);
441
MVT VecVT = N->getSimpleValueType(0);
442
uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
443
assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
444
// vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
445
// vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
446
return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
447
}
448
449
SDValue getSBBZero(SDNode *N) {
450
SDLoc dl(N);
451
MVT VT = N->getSimpleValueType(0);
452
453
// Create zero.
454
SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
455
SDValue Zero = SDValue(
456
CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
457
if (VT == MVT::i64) {
458
Zero = SDValue(
459
CurDAG->getMachineNode(
460
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
461
CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
462
CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
463
0);
464
}
465
466
// Copy flags to the EFLAGS register and glue it to next node.
467
unsigned Opcode = N->getOpcode();
468
assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
469
"Unexpected opcode for SBB materialization");
470
unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
471
SDValue EFLAGS =
472
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
473
N->getOperand(FlagOpIndex), SDValue());
474
475
// Create a 64-bit instruction if the result is 64-bits otherwise use the
476
// 32-bit version.
477
unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
478
MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
479
VTs = CurDAG->getVTList(SBBVT, MVT::i32);
480
return SDValue(
481
CurDAG->getMachineNode(Opc, dl, VTs,
482
{Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
483
0);
484
}
485
486
// Helper to detect unneeded AND instructions on shift amounts. Called
487
// from PatFrags in tablegen.
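// For example, a 32-bit shift already masks its count to 5 bits in hardware,
// so the AND in (shl X, (and Y, 31)) contributes nothing and the mask can be
// dropped when selecting the shift.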
488
bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
489
assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
490
const APInt &Val = N->getConstantOperandAPInt(1);
491
492
if (Val.countr_one() >= Width)
493
return true;
494
495
APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
496
return Mask.countr_one() >= Width;
497
}
498
499
/// Return an SDNode that returns the value of the global base register.
500
/// Output instructions required to initialize the global base register,
501
/// if necessary.
502
SDNode *getGlobalBaseReg();
503
504
/// Return a reference to the TargetMachine, casted to the target-specific
505
/// type.
506
const X86TargetMachine &getTargetMachine() const {
507
return static_cast<const X86TargetMachine &>(TM);
508
}
509
510
/// Return a reference to the TargetInstrInfo, casted to the target-specific
511
/// type.
512
const X86InstrInfo *getInstrInfo() const {
513
return Subtarget->getInstrInfo();
514
}
515
516
/// Return a condition code of the given SDNode
517
X86::CondCode getCondFromNode(SDNode *N) const;
518
519
/// Address-mode matching performs shift-of-and to and-of-shift
520
/// reassociation in order to expose more scaled addressing
521
/// opportunities.
522
bool ComplexPatternFuncMutatesDAG() const override {
523
return true;
524
}
525
526
bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
527
528
// Indicates we should prefer to use a non-temporal load for this load.
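// Non-temporal vector loads select to (V)MOVNTDQA, which requires a naturally
// aligned full-register memory operand; the size/alignment checks below mirror
// the ISA levels (SSE4.1/AVX2/AVX512) that provide the 16/32/64-byte forms.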
529
bool useNonTemporalLoad(LoadSDNode *N) const {
530
if (!N->isNonTemporal())
531
return false;
532
533
unsigned StoreSize = N->getMemoryVT().getStoreSize();
534
535
if (N->getAlign().value() < StoreSize)
536
return false;
537
538
switch (StoreSize) {
539
default: llvm_unreachable("Unsupported store size");
540
case 4:
541
case 8:
542
return false;
543
case 16:
544
return Subtarget->hasSSE41();
545
case 32:
546
return Subtarget->hasAVX2();
547
case 64:
548
return Subtarget->hasAVX512();
549
}
550
}
551
552
bool foldLoadStoreIntoMemOperand(SDNode *Node);
553
MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
554
bool matchBitExtract(SDNode *Node);
555
bool shrinkAndImmediate(SDNode *N);
556
bool isMaskZeroExtended(SDNode *N) const;
557
bool tryShiftAmountMod(SDNode *N);
558
bool tryShrinkShlLogicImm(SDNode *N);
559
bool tryVPTERNLOG(SDNode *N);
560
bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
561
SDNode *ParentC, SDValue A, SDValue B, SDValue C,
562
uint8_t Imm);
563
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
564
bool tryMatchBitSelect(SDNode *N);
565
566
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
567
const SDLoc &dl, MVT VT, SDNode *Node);
568
MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569
const SDLoc &dl, MVT VT, SDNode *Node,
570
SDValue &InGlue);
571
572
bool tryOptimizeRem8Extend(SDNode *N);
573
574
bool onlyUsesZeroFlag(SDValue Flags) const;
575
bool hasNoSignFlagUses(SDValue Flags) const;
576
bool hasNoCarryFlagUses(SDValue Flags) const;
577
};
578
579
class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
580
public:
581
static char ID;
582
explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
583
CodeGenOptLevel OptLevel)
584
: SelectionDAGISelLegacy(
585
ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
586
};
587
}
588
589
char X86DAGToDAGISelLegacy::ID = 0;
590
591
INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
592
593
// Returns true if this masked compare can be implemented legally with this
594
// type.
595
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
596
unsigned Opcode = N->getOpcode();
597
if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
598
Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
599
Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
600
// We can get 256-bit 8 element types here without VLX being enabled. When
601
// this happens we will use 512-bit operations and the mask will not be
602
// zero extended.
603
EVT OpVT = N->getOperand(0).getValueType();
604
// The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
605
// second operand.
606
if (Opcode == X86ISD::STRICT_CMPM)
607
OpVT = N->getOperand(1).getValueType();
608
if (OpVT.is256BitVector() || OpVT.is128BitVector())
609
return Subtarget->hasVLX();
610
611
return true;
612
}
613
// Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
614
if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
615
Opcode == X86ISD::FSETCCM_SAE)
616
return true;
617
618
return false;
619
}
620
621
// Returns true if we can assume the writer of the mask has zero extended it
622
// for us.
623
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
624
// If this is an AND, check if we have a compare on either side. As long as
625
// one side guarantees the mask is zero extended, the AND will preserve those
626
// zeros.
627
if (N->getOpcode() == ISD::AND)
628
return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
629
isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
630
631
return isLegalMaskCompare(N, Subtarget);
632
}
633
634
bool
635
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
636
if (OptLevel == CodeGenOptLevel::None)
637
return false;
638
639
if (!N.hasOneUse())
640
return false;
641
642
if (N.getOpcode() != ISD::LOAD)
643
return true;
644
645
// Don't fold non-temporal loads if we have an instruction for them.
646
if (useNonTemporalLoad(cast<LoadSDNode>(N)))
647
return false;
648
649
// If N is a load, do additional profitability checks.
650
if (U == Root) {
651
switch (U->getOpcode()) {
652
default: break;
653
case X86ISD::ADD:
654
case X86ISD::ADC:
655
case X86ISD::SUB:
656
case X86ISD::SBB:
657
case X86ISD::AND:
658
case X86ISD::XOR:
659
case X86ISD::OR:
660
case ISD::ADD:
661
case ISD::UADDO_CARRY:
662
case ISD::AND:
663
case ISD::OR:
664
case ISD::XOR: {
665
SDValue Op1 = U->getOperand(1);
666
667
// If the other operand is an 8-bit immediate we should fold the immediate
668
// instead. This reduces code size.
669
// e.g.
670
// movl 4(%esp), %eax
671
// addl $4, %eax
672
// vs.
673
// movl $4, %eax
674
// addl 4(%esp), %eax
675
// The former is 2 bytes shorter. In the case where the increment is 1,
676
// the saving can be 4 bytes (by using incl %eax).
677
if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
678
if (Imm->getAPIntValue().isSignedIntN(8))
679
return false;
680
681
// If this is a 64-bit AND with an immediate that fits in 32-bits,
682
// prefer using the smaller and over folding the load. This is needed to
683
// make sure immediates created by shrinkAndImmediate are always folded.
684
// Ideally we would narrow the load during DAG combine and get the
685
// best of both worlds.
686
if (U->getOpcode() == ISD::AND &&
687
Imm->getAPIntValue().getBitWidth() == 64 &&
688
Imm->getAPIntValue().isIntN(32))
689
return false;
690
691
// If this is really a zext_inreg that can be represented with a movzx
692
// instruction, prefer that.
693
// TODO: We could shrink the load and fold if it is non-volatile.
694
if (U->getOpcode() == ISD::AND &&
695
(Imm->getAPIntValue() == UINT8_MAX ||
696
Imm->getAPIntValue() == UINT16_MAX ||
697
Imm->getAPIntValue() == UINT32_MAX))
698
return false;
699
700
// ADD/SUB can negate the immediate and use the opposite operation
701
// to fit 128 into a sign extended 8 bit immediate.
702
if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
703
(-Imm->getAPIntValue()).isSignedIntN(8))
704
return false;
705
706
if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
707
(-Imm->getAPIntValue()).isSignedIntN(8) &&
708
hasNoCarryFlagUses(SDValue(U, 1)))
709
return false;
710
}
711
712
// If the other operand is a TLS address, we should fold it instead.
713
// This produces
714
// movl %gs:0, %eax
715
// leal i@NTPOFF(%eax), %eax
716
// instead of
717
// movl $i@NTPOFF, %eax
718
// addl %gs:0, %eax
719
// if the block also has an access to a second TLS address this will save
720
// a load.
721
// FIXME: This is probably also true for non-TLS addresses.
722
if (Op1.getOpcode() == X86ISD::Wrapper) {
723
SDValue Val = Op1.getOperand(0);
724
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
725
return false;
726
}
727
728
// Don't fold load if this matches the BTS/BTR/BTC patterns.
729
// BTS: (or X, (shl 1, n))
730
// BTR: (and X, (rotl -2, n))
731
// BTC: (xor X, (shl 1, n))
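// For example, (or X, (shl 1, n)) corresponds to 'x |= 1 << n' and is better
// selected as BTS; keeping the load unfolded preserves that match.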
732
if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
733
if (U->getOperand(0).getOpcode() == ISD::SHL &&
734
isOneConstant(U->getOperand(0).getOperand(0)))
735
return false;
736
737
if (U->getOperand(1).getOpcode() == ISD::SHL &&
738
isOneConstant(U->getOperand(1).getOperand(0)))
739
return false;
740
}
741
if (U->getOpcode() == ISD::AND) {
742
SDValue U0 = U->getOperand(0);
743
SDValue U1 = U->getOperand(1);
744
if (U0.getOpcode() == ISD::ROTL) {
745
auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
746
if (C && C->getSExtValue() == -2)
747
return false;
748
}
749
750
if (U1.getOpcode() == ISD::ROTL) {
751
auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
752
if (C && C->getSExtValue() == -2)
753
return false;
754
}
755
}
756
757
break;
758
}
759
case ISD::SHL:
760
case ISD::SRA:
761
case ISD::SRL:
762
// Don't fold a load into a shift by immediate. The BMI2 instructions
763
// support folding a load, but not an immediate. The legacy instructions
764
// support folding an immediate, but can't fold a load. Folding an
765
// immediate is preferable to folding a load.
766
if (isa<ConstantSDNode>(U->getOperand(1)))
767
return false;
768
769
break;
770
}
771
}
772
773
// Prevent folding a load if this can be implemented with an insert_subreg or
774
// a move that implicitly zeroes.
775
if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
776
isNullConstant(Root->getOperand(2)) &&
777
(Root->getOperand(0).isUndef() ||
778
ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
779
return false;
780
781
return true;
782
}
783
784
// Indicates it is profitable to form an AVX512 masked operation. Returning
785
// false will favor a register-register masked move or vblendm and the
786
// operation will be selected separately.
787
bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
788
assert(
789
(N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
790
"Unexpected opcode!");
791
792
// If the operation has additional users, the operation will be duplicated.
793
// Check the use count to prevent that.
794
// FIXME: Are there cheap opcodes we might want to duplicate?
795
return N->getOperand(1).hasOneUse();
796
}
797
798
/// Replace the original chain operand of the call with
799
/// load's chain operand and move load below the call's chain operand.
800
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
801
SDValue Call, SDValue OrigChain) {
802
SmallVector<SDValue, 8> Ops;
803
SDValue Chain = OrigChain.getOperand(0);
804
if (Chain.getNode() == Load.getNode())
805
Ops.push_back(Load.getOperand(0));
806
else {
807
assert(Chain.getOpcode() == ISD::TokenFactor &&
808
"Unexpected chain operand");
809
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
810
if (Chain.getOperand(i).getNode() == Load.getNode())
811
Ops.push_back(Load.getOperand(0));
812
else
813
Ops.push_back(Chain.getOperand(i));
814
SDValue NewChain =
815
CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
816
Ops.clear();
817
Ops.push_back(NewChain);
818
}
819
Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
820
CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
821
CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
822
Load.getOperand(1), Load.getOperand(2));
823
824
Ops.clear();
825
Ops.push_back(SDValue(Load.getNode(), 1));
826
Ops.append(Call->op_begin() + 1, Call->op_end());
827
CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
828
}
829
830
/// Return true if call address is a load and it can be
831
/// moved below CALLSEQ_START and the chains leading up to the call.
832
/// Return the CALLSEQ_START by reference as a second output.
833
/// In the case of a tail call, there isn't a callseq node between the call
834
/// chain and the load.
835
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
836
// The transformation is somewhat dangerous if the call's chain was glued to
837
// the call. After MoveBelowOrigChain the load is moved between the call and
838
// the chain; this can create a cycle if the load is not folded. So it is
839
// *really* important that we are sure the load will be folded.
840
if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
841
return false;
842
auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
843
if (!LD ||
844
!LD->isSimple() ||
845
LD->getAddressingMode() != ISD::UNINDEXED ||
846
LD->getExtensionType() != ISD::NON_EXTLOAD)
847
return false;
848
849
// Now let's find the callseq_start.
850
while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
851
if (!Chain.hasOneUse())
852
return false;
853
Chain = Chain.getOperand(0);
854
}
855
856
if (!Chain.getNumOperands())
857
return false;
858
// Since we are not checking for AA here, conservatively abort if the chain
859
// writes to memory. It's not safe to move the callee (a load) across a store.
860
if (isa<MemSDNode>(Chain.getNode()) &&
861
cast<MemSDNode>(Chain.getNode())->writeMem())
862
return false;
863
if (Chain.getOperand(0).getNode() == Callee.getNode())
864
return true;
865
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
866
Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
867
Callee.getValue(1).hasOneUse())
868
return true;
869
return false;
870
}
871
872
static bool isEndbrImm64(uint64_t Imm) {
873
// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
874
// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
875
if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
876
return false;
877
878
uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
879
0x65, 0x66, 0x67, 0xf0, 0xf2};
880
int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
881
while (i < 64) {
882
uint8_t Byte = (Imm >> i) & 0xFF;
883
if (Byte == 0xF3)
884
return true;
885
if (!llvm::is_contained(OptionalPrefixBytes, Byte))
886
return false;
887
i += 8;
888
}
889
890
return false;
891
}
892
893
static bool needBWI(MVT VT) {
894
return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
895
}
896
897
void X86DAGToDAGISel::PreprocessISelDAG() {
898
bool MadeChange = false;
899
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
900
E = CurDAG->allnodes_end(); I != E; ) {
901
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
902
903
// This is for CET enhancement.
//
// ENDBR32 and ENDBR64 have specific opcodes:
// ENDBR32: F3 0F 1E FB
// ENDBR64: F3 0F 1E FA
// We want to make sure attackers cannot find unintended ENDBR32/64
// opcode matches in the binary.
// Here's an example:
// If the compiler had to generate asm for the following code:
// a = 0xF30F1EFA
// it could, for example, generate:
// mov 0xF30F1EFA, dword ptr[a]
// In such a case, the binary would include a gadget that starts
// with a fake ENDBR64 opcode. Therefore, we split such generation
// into multiple operations so that the immediate does not show up
// verbatim in the binary.
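// For illustration: the constant is materialized as its bitwise complement
// followed by a NOT (e.g. 0xF30F1EFA becomes 0x0CF0E105 plus a NOT), so the
// ENDBR byte pattern never appears as an immediate.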
if (N->getOpcode() == ISD::Constant) {
919
MVT VT = N->getSimpleValueType(0);
920
int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
921
int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
922
if (Imm == EndbrImm || isEndbrImm64(Imm)) {
923
// Check that the cf-protection-branch is enabled.
924
Metadata *CFProtectionBranch =
925
MF->getFunction().getParent()->getModuleFlag(
926
"cf-protection-branch");
927
if (CFProtectionBranch || IndirectBranchTracking) {
928
SDLoc dl(N);
929
SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
930
Complement = CurDAG->getNOT(dl, Complement, VT);
931
--I;
932
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
933
++I;
934
MadeChange = true;
935
continue;
936
}
937
}
938
}
939
940
// If this is a target specific AND node with no flag usages, turn it back
941
// into ISD::AND to enable test instruction matching.
942
if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
943
SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
944
N->getOperand(0), N->getOperand(1));
945
--I;
946
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
947
++I;
948
MadeChange = true;
949
continue;
950
}
951
952
// Convert vector increment or decrement to sub/add with an all-ones
953
// constant:
954
// add X, <1, 1...> --> sub X, <-1, -1...>
955
// sub X, <1, 1...> --> add X, <-1, -1...>
956
// The all-ones vector constant can be materialized using a pcmpeq
957
// instruction that is commonly recognized as an idiom (has no register
958
// dependency), so that's better/smaller than loading a splat 1 constant.
959
//
960
// But don't do this if it would inhibit a potentially profitable load
961
// folding opportunity for the other operand. That only occurs with the
962
// intersection of:
963
// (1) The other operand (op0) is load foldable.
964
// (2) The op is an add (otherwise, we are *creating* an add and can still
965
// load fold the other op).
966
// (3) The target has AVX (otherwise, we have a destructive add and can't
967
// load fold the other op without killing the constant op).
968
// (4) The constant 1 vector has multiple uses (so it is profitable to load
969
// into a register anyway).
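// For example, adding a splat-1 v4i32 constant to %xmm0 can be emitted as
//   pcmpeqd %xmm1, %xmm1   ; all-ones (-1 per lane) idiom
//   psubd   %xmm1, %xmm0   ; x - (-1) == x + 1
// instead of loading <1,1,1,1> from the constant pool.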
970
auto mayPreventLoadFold = [&]() {
971
return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
972
N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
973
!N->getOperand(1).hasOneUse();
974
};
975
if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
976
N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
977
APInt SplatVal;
978
if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
979
SplatVal.isOne()) {
980
SDLoc DL(N);
981
982
MVT VT = N->getSimpleValueType(0);
983
unsigned NumElts = VT.getSizeInBits() / 32;
984
SDValue AllOnes =
985
CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
986
AllOnes = CurDAG->getBitcast(VT, AllOnes);
987
988
unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
989
SDValue Res =
990
CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
991
--I;
992
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
993
++I;
994
MadeChange = true;
995
continue;
996
}
997
}
998
999
switch (N->getOpcode()) {
1000
case X86ISD::VBROADCAST: {
1001
MVT VT = N->getSimpleValueType(0);
1002
// Emulate v32i16/v64i8 broadcast without BWI.
1003
if (!Subtarget->hasBWI() && needBWI(VT)) {
1004
MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1005
SDLoc dl(N);
1006
SDValue NarrowBCast =
1007
CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1008
SDValue Res =
1009
CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1010
NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1011
unsigned Index = NarrowVT.getVectorMinNumElements();
1012
Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1013
CurDAG->getIntPtrConstant(Index, dl));
1014
1015
--I;
1016
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1017
++I;
1018
MadeChange = true;
1019
continue;
1020
}
1021
1022
break;
1023
}
1024
case X86ISD::VBROADCAST_LOAD: {
1025
MVT VT = N->getSimpleValueType(0);
1026
// Emulate v32i16/v64i8 broadcast without BWI.
1027
if (!Subtarget->hasBWI() && needBWI(VT)) {
1028
MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1029
auto *MemNode = cast<MemSDNode>(N);
1030
SDLoc dl(N);
1031
SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1032
SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1033
SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1034
X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1035
MemNode->getMemOperand());
1036
SDValue Res =
1037
CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1038
NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1039
unsigned Index = NarrowVT.getVectorMinNumElements();
1040
Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1041
CurDAG->getIntPtrConstant(Index, dl));
1042
1043
--I;
1044
SDValue To[] = {Res, NarrowBCast.getValue(1)};
1045
CurDAG->ReplaceAllUsesWith(N, To);
1046
++I;
1047
MadeChange = true;
1048
continue;
1049
}
1050
1051
break;
1052
}
1053
case ISD::LOAD: {
1054
// If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1055
// load, then just extract the lower subvector and avoid the second load.
1056
auto *Ld = cast<LoadSDNode>(N);
1057
MVT VT = N->getSimpleValueType(0);
1058
if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1059
!(VT.is128BitVector() || VT.is256BitVector()))
1060
break;
1061
1062
MVT MaxVT = VT;
1063
SDNode *MaxLd = nullptr;
1064
SDValue Ptr = Ld->getBasePtr();
1065
SDValue Chain = Ld->getChain();
1066
for (SDNode *User : Ptr->uses()) {
1067
auto *UserLd = dyn_cast<LoadSDNode>(User);
1068
MVT UserVT = User->getSimpleValueType(0);
1069
if (User != N && UserLd && ISD::isNormalLoad(User) &&
1070
UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1071
!User->hasAnyUseOfValue(1) &&
1072
(UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1073
UserVT.getSizeInBits() > VT.getSizeInBits() &&
1074
(!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1075
MaxLd = User;
1076
MaxVT = UserVT;
1077
}
1078
}
1079
if (MaxLd) {
1080
SDLoc dl(N);
1081
unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1082
MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1083
SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1084
SDValue(MaxLd, 0),
1085
CurDAG->getIntPtrConstant(0, dl));
1086
SDValue Res = CurDAG->getBitcast(VT, Extract);
1087
1088
--I;
1089
SDValue To[] = {Res, SDValue(MaxLd, 1)};
1090
CurDAG->ReplaceAllUsesWith(N, To);
1091
++I;
1092
MadeChange = true;
1093
continue;
1094
}
1095
break;
1096
}
1097
case ISD::VSELECT: {
1098
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1099
EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1100
if (EleVT == MVT::i1)
1101
break;
1102
1103
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1104
assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1105
"We can't replace VSELECT with BLENDV in vXi16!");
1106
SDValue R;
1107
if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1108
EleVT.getSizeInBits()) {
1109
R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1110
N->getOperand(0), N->getOperand(1), N->getOperand(2),
1111
CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1112
} else {
1113
R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1114
N->getOperand(0), N->getOperand(1),
1115
N->getOperand(2));
1116
}
1117
--I;
1118
CurDAG->ReplaceAllUsesWith(N, R.getNode());
1119
++I;
1120
MadeChange = true;
1121
continue;
1122
}
1123
case ISD::FP_ROUND:
1124
case ISD::STRICT_FP_ROUND:
1125
case ISD::FP_TO_SINT:
1126
case ISD::FP_TO_UINT:
1127
case ISD::STRICT_FP_TO_SINT:
1128
case ISD::STRICT_FP_TO_UINT: {
1129
// Replace vector fp_to_s/uint with their X86 specific equivalent so we
1130
// don't need 2 sets of patterns.
1131
if (!N->getSimpleValueType(0).isVector())
1132
break;
1133
1134
unsigned NewOpc;
1135
switch (N->getOpcode()) {
1136
default: llvm_unreachable("Unexpected opcode!");
1137
case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1138
case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1139
case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1140
case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1141
case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1142
case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1143
}
1144
SDValue Res;
1145
if (N->isStrictFPOpcode())
1146
Res =
1147
CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1148
{N->getOperand(0), N->getOperand(1)});
1149
else
1150
Res =
1151
CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1152
N->getOperand(0));
1153
--I;
1154
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1155
++I;
1156
MadeChange = true;
1157
continue;
1158
}
1159
case ISD::SHL:
1160
case ISD::SRA:
1161
case ISD::SRL: {
1162
// Replace vector shifts with their X86 specific equivalent so we don't
1163
// need 2 sets of patterns.
1164
if (!N->getValueType(0).isVector())
1165
break;
1166
1167
unsigned NewOpc;
1168
switch (N->getOpcode()) {
1169
default: llvm_unreachable("Unexpected opcode!");
1170
case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1171
case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1172
case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1173
}
1174
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1175
N->getOperand(0), N->getOperand(1));
1176
--I;
1177
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1178
++I;
1179
MadeChange = true;
1180
continue;
1181
}
1182
case ISD::ANY_EXTEND:
1183
case ISD::ANY_EXTEND_VECTOR_INREG: {
1184
// Replace vector any extend with the zero extend equivalents so we don't
1185
// need 2 sets of patterns. Ignore vXi1 extensions.
1186
if (!N->getValueType(0).isVector())
1187
break;
1188
1189
unsigned NewOpc;
1190
if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1191
assert(N->getOpcode() == ISD::ANY_EXTEND &&
1192
"Unexpected opcode for mask vector!");
1193
NewOpc = ISD::SIGN_EXTEND;
1194
} else {
1195
NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1196
? ISD::ZERO_EXTEND
1197
: ISD::ZERO_EXTEND_VECTOR_INREG;
1198
}
1199
1200
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1201
N->getOperand(0));
1202
--I;
1203
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1204
++I;
1205
MadeChange = true;
1206
continue;
1207
}
1208
case ISD::FCEIL:
1209
case ISD::STRICT_FCEIL:
1210
case ISD::FFLOOR:
1211
case ISD::STRICT_FFLOOR:
1212
case ISD::FTRUNC:
1213
case ISD::STRICT_FTRUNC:
1214
case ISD::FROUNDEVEN:
1215
case ISD::STRICT_FROUNDEVEN:
1216
case ISD::FNEARBYINT:
1217
case ISD::STRICT_FNEARBYINT:
1218
case ISD::FRINT:
1219
case ISD::STRICT_FRINT: {
1220
// Replace fp rounding with their X86 specific equivalent so we don't
1221
// need 2 sets of patterns.
1222
unsigned Imm;
1223
switch (N->getOpcode()) {
1224
default: llvm_unreachable("Unexpected opcode!");
1225
case ISD::STRICT_FCEIL:
1226
case ISD::FCEIL: Imm = 0xA; break;
1227
case ISD::STRICT_FFLOOR:
1228
case ISD::FFLOOR: Imm = 0x9; break;
1229
case ISD::STRICT_FTRUNC:
1230
case ISD::FTRUNC: Imm = 0xB; break;
1231
case ISD::STRICT_FROUNDEVEN:
1232
case ISD::FROUNDEVEN: Imm = 0x8; break;
1233
case ISD::STRICT_FNEARBYINT:
1234
case ISD::FNEARBYINT: Imm = 0xC; break;
1235
case ISD::STRICT_FRINT:
1236
case ISD::FRINT: Imm = 0x4; break;
1237
}
1238
SDLoc dl(N);
1239
bool IsStrict = N->isStrictFPOpcode();
1240
SDValue Res;
1241
if (IsStrict)
1242
Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1243
{N->getValueType(0), MVT::Other},
1244
{N->getOperand(0), N->getOperand(1),
1245
CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1246
else
1247
Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1248
N->getOperand(0),
1249
CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1250
--I;
1251
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1252
++I;
1253
MadeChange = true;
1254
continue;
1255
}
1256
case X86ISD::FANDN:
1257
case X86ISD::FAND:
1258
case X86ISD::FOR:
1259
case X86ISD::FXOR: {
1260
// Widen scalar fp logic ops to vector to reduce isel patterns.
1261
// FIXME: Can we do this during lowering/combine.
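// For example, a scalar f32 FAND becomes an ANDPS/PAND on the full XMM
// register; only the low element of the widened result is extracted and used.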
1262
MVT VT = N->getSimpleValueType(0);
1263
if (VT.isVector() || VT == MVT::f128)
1264
break;
1265
1266
MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1267
: VT == MVT::f32 ? MVT::v4f32
1268
: MVT::v8f16;
1269
1270
SDLoc dl(N);
1271
SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1272
N->getOperand(0));
1273
SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1274
N->getOperand(1));
1275
1276
SDValue Res;
1277
if (Subtarget->hasSSE2()) {
1278
EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1279
Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1280
Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1281
unsigned Opc;
1282
switch (N->getOpcode()) {
1283
default: llvm_unreachable("Unexpected opcode!");
1284
case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1285
case X86ISD::FAND: Opc = ISD::AND; break;
1286
case X86ISD::FOR: Opc = ISD::OR; break;
1287
case X86ISD::FXOR: Opc = ISD::XOR; break;
1288
}
1289
Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1290
Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1291
} else {
1292
Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1293
}
1294
Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1295
CurDAG->getIntPtrConstant(0, dl));
1296
--I;
1297
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1298
++I;
1299
MadeChange = true;
1300
continue;
1301
}
1302
}
1303
1304
if (OptLevel != CodeGenOptLevel::None &&
1305
// Only do this when the target can fold the load into the call or
1306
// jmp.
1307
!Subtarget->useIndirectThunkCalls() &&
1308
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1309
(N->getOpcode() == X86ISD::TC_RETURN &&
1310
(Subtarget->is64Bit() ||
1311
!getTargetMachine().isPositionIndependent())))) {
1312
/// Also try moving call address load from outside callseq_start to just
1313
/// before the call to allow it to be folded.
1314
///
1315
/// [Load chain]
1316
/// ^
1317
/// |
1318
/// [Load]
1319
/// ^ ^
1320
/// | |
1321
/// / \--
1322
/// / |
1323
///[CALLSEQ_START] |
1324
/// ^ |
1325
/// | |
1326
/// [LOAD/C2Reg] |
1327
/// | |
1328
/// \ /
1329
/// \ /
1330
/// [CALL]
1331
bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1332
SDValue Chain = N->getOperand(0);
1333
SDValue Load = N->getOperand(1);
1334
if (!isCalleeLoad(Load, Chain, HasCallSeq))
1335
continue;
1336
moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1337
++NumLoadMoved;
1338
MadeChange = true;
1339
continue;
1340
}
1341
1342
// Lower fpround and fpextend nodes that target the FP stack to be a store
// and a load to the stack. This is a gross hack. We would like to simply mark
1344
// these as being illegal, but when we do that, legalize produces these when
1345
// it expands calls, then expands these in the same legalize pass. We would
1346
// like dag combine to be able to hack on these between the call expansion
1347
// and the node legalization. As such this pass basically does "really
1348
// late" legalization of these inline with the X86 isel pass.
1349
// FIXME: This should only happen when not compiled with -O0.
1350
switch (N->getOpcode()) {
1351
default: continue;
1352
case ISD::FP_ROUND:
1353
case ISD::FP_EXTEND:
1354
{
1355
MVT SrcVT = N->getOperand(0).getSimpleValueType();
1356
MVT DstVT = N->getSimpleValueType(0);
1357
1358
// If any of the sources are vectors, no fp stack involved.
1359
if (SrcVT.isVector() || DstVT.isVector())
1360
continue;
1361
1362
// If the source and destination are SSE registers, then this is a legal
1363
// conversion that should not be lowered.
1364
const X86TargetLowering *X86Lowering =
1365
static_cast<const X86TargetLowering *>(TLI);
1366
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1367
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1368
if (SrcIsSSE && DstIsSSE)
1369
continue;
1370
1371
if (!SrcIsSSE && !DstIsSSE) {
1372
// If this is an FPStack extension, it is a noop.
1373
if (N->getOpcode() == ISD::FP_EXTEND)
1374
continue;
1375
// If this is a value-preserving FPStack truncation, it is a noop.
1376
if (N->getConstantOperandVal(1))
1377
continue;
1378
}
1379
1380
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1381
// FPStack has extload and truncstore. SSE can fold direct loads into other
1382
// operations. Based on this, decide what we want to do.
1383
MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1384
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1385
int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1386
MachinePointerInfo MPI =
1387
MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1388
SDLoc dl(N);
1389
1390
// FIXME: optimize the case where the src/dest is a load or store?
1391
1392
SDValue Store = CurDAG->getTruncStore(
1393
CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1394
SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1395
MemTmp, MPI, MemVT);
1396
1397
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1398
// extload we created. This will cause general havoc on the dag because
1399
// anything below the conversion could be folded into other existing nodes.
1400
// To avoid invalidating 'I', back it up to the convert node.
1401
--I;
1402
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1403
break;
1404
}
1405
1406
// The sequence of events for lowering STRICT_FP versions of these nodes
// requires dealing with the chain differently, as there is already a
// preexisting chain.
1408
case ISD::STRICT_FP_ROUND:
1409
case ISD::STRICT_FP_EXTEND:
1410
{
1411
MVT SrcVT = N->getOperand(1).getSimpleValueType();
1412
MVT DstVT = N->getSimpleValueType(0);
1413
1414
// If any of the sources are vectors, no fp stack involved.
1415
if (SrcVT.isVector() || DstVT.isVector())
1416
continue;
1417
1418
// If the source and destination are SSE registers, then this is a legal
1419
// conversion that should not be lowered.
1420
const X86TargetLowering *X86Lowering =
1421
static_cast<const X86TargetLowering *>(TLI);
1422
bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1423
bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1424
if (SrcIsSSE && DstIsSSE)
1425
continue;
1426
1427
if (!SrcIsSSE && !DstIsSSE) {
1428
// If this is an FPStack extension, it is a noop.
1429
if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1430
continue;
1431
// If this is a value-preserving FPStack truncation, it is a noop.
1432
if (N->getConstantOperandVal(2))
1433
continue;
1434
}
1435
1436
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1437
// FPStack has extload and truncstore. SSE can fold direct loads into other
1438
// operations. Based on this, decide what we want to do.
1439
MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1440
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1441
int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1442
MachinePointerInfo MPI =
1443
MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1444
SDLoc dl(N);
1445
1446
// FIXME: optimize the case where the src/dest is a load or store?
1447
1448
// Since the operation is StrictFP, use the preexisting chain.
1449
SDValue Store, Result;
1450
if (!SrcIsSSE) {
1451
SDVTList VTs = CurDAG->getVTList(MVT::Other);
1452
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1453
Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1454
MPI, /*Align*/ std::nullopt,
1455
MachineMemOperand::MOStore);
1456
if (N->getFlags().hasNoFPExcept()) {
1457
SDNodeFlags Flags = Store->getFlags();
1458
Flags.setNoFPExcept(true);
1459
Store->setFlags(Flags);
1460
}
1461
} else {
1462
assert(SrcVT == MemVT && "Unexpected VT!");
1463
Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1464
MPI);
1465
}
1466
1467
if (!DstIsSSE) {
1468
SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1469
SDValue Ops[] = {Store, MemTmp};
1470
Result = CurDAG->getMemIntrinsicNode(
1471
X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1472
/*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1473
if (N->getFlags().hasNoFPExcept()) {
1474
SDNodeFlags Flags = Result->getFlags();
1475
Flags.setNoFPExcept(true);
1476
Result->setFlags(Flags);
1477
}
1478
} else {
1479
assert(DstVT == MemVT && "Unexpected VT!");
1480
Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1481
}
1482
1483
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1484
// extload we created. This will cause general havoc on the dag because
1485
// anything below the conversion could be folded into other existing nodes.
1486
// To avoid invalidating 'I', back it up to the convert node.
1487
--I;
1488
CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1489
break;
1490
}
1491
}
1492
1493
1494
// Now that we did that, the node is dead. Increment the iterator to the
1495
// next node to process, then delete N.
1496
++I;
1497
MadeChange = true;
1498
}
1499
1500
// Remove any dead nodes that may have been left behind.
1501
if (MadeChange)
1502
CurDAG->RemoveDeadNodes();
1503
}
1504
1505
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
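// After an 8-bit divide, the remainder lives in AH and is typically copied out
// via a MOVZX/MOVSX *_NOREX of the 8-bit subregister; a second zero/sign
// extend of that value is redundant and can reuse the first extend.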
1506
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1507
unsigned Opc = N->getMachineOpcode();
1508
if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1509
Opc != X86::MOVSX64rr8)
1510
return false;
1511
1512
SDValue N0 = N->getOperand(0);
1513
1514
// We need to be extracting the lower bit of an extend.
1515
if (!N0.isMachineOpcode() ||
1516
N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1517
N0.getConstantOperandVal(1) != X86::sub_8bit)
1518
return false;
1519
1520
// We're looking for either a movsx or movzx to match the original opcode.
1521
unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1522
: X86::MOVSX32rr8_NOREX;
1523
SDValue N00 = N0.getOperand(0);
1524
if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1525
return false;
1526
1527
if (Opc == X86::MOVSX64rr8) {
1528
// We had a sign extend from 8 to 64 bits; we still need to go from 32
// to 64.
1530
MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1531
MVT::i64, N00);
1532
ReplaceUses(N, Extend);
1533
} else {
1534
// Ok we can drop this extend and just use the original extend.
1535
ReplaceUses(N, N00.getNode());
1536
}
1537
1538
return true;
1539
}
1540
1541
void X86DAGToDAGISel::PostprocessISelDAG() {
1542
// Skip peepholes at -O0.
1543
if (TM.getOptLevel() == CodeGenOptLevel::None)
1544
return;
1545
1546
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1547
1548
bool MadeChange = false;
1549
while (Position != CurDAG->allnodes_begin()) {
1550
SDNode *N = &*--Position;
1551
// Skip dead nodes and any non-machine opcodes.
1552
if (N->use_empty() || !N->isMachineOpcode())
1553
continue;
1554
1555
if (tryOptimizeRem8Extend(N)) {
1556
MadeChange = true;
1557
continue;
1558
}
1559
1560
unsigned Opc = N->getMachineOpcode();
1561
switch (Opc) {
1562
default:
1563
continue;
1564
// ANDrr/rm + TESTrr -> TESTrr/TESTmr
1565
case X86::TEST8rr:
1566
case X86::TEST16rr:
1567
case X86::TEST32rr:
1568
case X86::TEST64rr:
1569
// ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1570
case X86::CTEST8rr:
1571
case X86::CTEST16rr:
1572
case X86::CTEST32rr:
1573
case X86::CTEST64rr: {
1574
auto &Op0 = N->getOperand(0);
1575
if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1576
!Op0.isMachineOpcode())
1577
continue;
1578
SDValue And = N->getOperand(0);
1579
#define CASE_ND(OP) \
1580
case X86::OP: \
1581
case X86::OP##_ND:
1582
switch (And.getMachineOpcode()) {
1583
default:
1584
continue;
1585
CASE_ND(AND8rr)
1586
CASE_ND(AND16rr)
1587
CASE_ND(AND32rr)
1588
CASE_ND(AND64rr) {
1589
if (And->hasAnyUseOfValue(1))
1590
continue;
1591
SmallVector<SDValue> Ops(N->op_values());
1592
Ops[0] = And.getOperand(0);
1593
Ops[1] = And.getOperand(1);
1594
MachineSDNode *Test =
1595
CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1596
ReplaceUses(N, Test);
1597
MadeChange = true;
1598
continue;
1599
}
1600
CASE_ND(AND8rm)
1601
CASE_ND(AND16rm)
1602
CASE_ND(AND32rm)
1603
CASE_ND(AND64rm) {
1604
if (And->hasAnyUseOfValue(1))
1605
continue;
1606
unsigned NewOpc;
1607
bool IsCTESTCC = X86::isCTESTCC(Opc);
1608
#define FROM_TO(A, B) \
1609
CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1610
break;
1611
switch (And.getMachineOpcode()) {
1612
FROM_TO(AND8rm, TEST8mr);
1613
FROM_TO(AND16rm, TEST16mr);
1614
FROM_TO(AND32rm, TEST32mr);
1615
FROM_TO(AND64rm, TEST64mr);
1616
}
1617
#undef FROM_TO
1618
#undef CASE_ND
1619
// Need to swap the memory and register operand.
1620
SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1621
And.getOperand(3), And.getOperand(4),
1622
And.getOperand(5), And.getOperand(0)};
1623
// CC, Cflags.
1624
if (IsCTESTCC) {
1625
Ops.push_back(N->getOperand(2));
1626
Ops.push_back(N->getOperand(3));
1627
}
1628
// Chain of memory load
1629
Ops.push_back(And.getOperand(6));
1630
// Glue
1631
if (IsCTESTCC)
1632
Ops.push_back(N->getOperand(4));
1633
1634
MachineSDNode *Test = CurDAG->getMachineNode(
1635
NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1636
CurDAG->setNodeMemRefs(
1637
Test, cast<MachineSDNode>(And.getNode())->memoperands());
1638
ReplaceUses(And.getValue(2), SDValue(Test, 1));
1639
ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1640
MadeChange = true;
1641
continue;
1642
}
1643
}
1644
}
1645
// Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1646
// used. We're doing this late so we can prefer to fold the AND into masked
1647
// comparisons. Doing that can be better for the live range of the mask
1648
// register.
1649
case X86::KORTESTBrr:
1650
case X86::KORTESTWrr:
1651
case X86::KORTESTDrr:
1652
case X86::KORTESTQrr: {
1653
SDValue Op0 = N->getOperand(0);
1654
if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1655
!Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1656
continue;
1657
#define CASE(A) \
1658
case X86::A: \
1659
break;
1660
switch (Op0.getMachineOpcode()) {
1661
default:
1662
continue;
1663
CASE(KANDBrr)
1664
CASE(KANDWrr)
1665
CASE(KANDDrr)
1666
CASE(KANDQrr)
1667
}
1668
unsigned NewOpc;
1669
#define FROM_TO(A, B) \
1670
case X86::A: \
1671
NewOpc = X86::B; \
1672
break;
1673
switch (Opc) {
1674
FROM_TO(KORTESTBrr, KTESTBrr)
1675
FROM_TO(KORTESTWrr, KTESTWrr)
1676
FROM_TO(KORTESTDrr, KTESTDrr)
1677
FROM_TO(KORTESTQrr, KTESTQrr)
1678
}
1679
// KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1680
// KAND instructions and KTEST use the same ISA feature.
1681
if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI())
1682
continue;
1683
#undef FROM_TO
1684
MachineSDNode *KTest = CurDAG->getMachineNode(
1685
NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1686
ReplaceUses(N, KTest);
1687
MadeChange = true;
1688
continue;
1689
}
1690
// Attempt to remove vectors moves that were inserted to zero upper bits.
1691
case TargetOpcode::SUBREG_TO_REG: {
1692
unsigned SubRegIdx = N->getConstantOperandVal(2);
1693
if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1694
continue;
1695
1696
SDValue Move = N->getOperand(1);
1697
if (!Move.isMachineOpcode())
1698
continue;
1699
1700
// Make sure it's one of the move opcodes we recognize.
1701
switch (Move.getMachineOpcode()) {
1702
default:
1703
continue;
1704
CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1705
CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1706
CASE(VMOVDQArr) CASE(VMOVDQUrr)
1707
CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1708
CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1709
CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1710
CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1711
CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1712
CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1713
CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1714
CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1715
CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1716
CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1717
CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1718
}
1719
#undef CASE
1720
1721
SDValue In = Move.getOperand(0);
1722
if (!In.isMachineOpcode() ||
1723
In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1724
continue;
1725
1726
// Make sure the producing instruction has a VEX, XOP, or EVEX prefix; this
// excludes legacy-encoded instructions (such as the SHA instructions), which
// do not zero the upper bits of the destination register.
1728
uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1729
if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1730
(TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1731
(TSFlags & X86II::EncodingMask) != X86II::XOP)
1732
continue;
1733
1734
// Producing instruction is another vector instruction. We can drop the
1735
// move.
1736
CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1737
MadeChange = true;
1738
}
1739
}
1740
}
1741
1742
if (MadeChange)
1743
CurDAG->RemoveDeadNodes();
1744
}
1745
1746
1747
/// Emit any code that needs to be executed only in the main function.
1748
void X86DAGToDAGISel::emitSpecialCodeForMain() {
1749
if (Subtarget->isTargetCygMing()) {
1750
TargetLowering::ArgListTy Args;
1751
auto &DL = CurDAG->getDataLayout();
1752
1753
TargetLowering::CallLoweringInfo CLI(*CurDAG);
1754
CLI.setChain(CurDAG->getRoot())
1755
.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1756
CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1757
std::move(Args));
1758
const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1759
std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1760
CurDAG->setRoot(Result.second);
1761
}
1762
}
1763
1764
void X86DAGToDAGISel::emitFunctionEntryCode() {
1765
// If this is main, emit special code for main.
1766
const Function &F = MF->getFunction();
1767
if (F.hasExternalLinkage() && F.getName() == "main")
1768
emitSpecialCodeForMain();
1769
}
1770
1771
static bool isDispSafeForFrameIndex(int64_t Val) {
1772
// On 64-bit platforms, we can run into an issue where a frame index
1773
// includes a displacement that, when added to the explicit displacement,
1774
// will overflow the displacement field. Assuming that the frame index
1775
// displacement fits into a 31-bit integer (which is only slightly more
1776
// aggressive than the current fundamental assumption that it fits into
1777
// a 32-bit integer), a 31-bit disp should always be safe.
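// As a hypothetical example: a frame object that ends up at offset
// 0x40000000 combined with an explicit displacement of 0x40000000 would
// overflow the signed 32-bit displacement field, so displacements are
// conservatively limited to 31 bits here.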
return isInt<31>(Val);
1779
}
1780
1781
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1782
X86ISelAddressMode &AM) {
1783
// We may have already matched a displacement and the caller just added the
1784
// symbolic displacement. So we still need to do the checks even if Offset
1785
// is zero.
1786
1787
int64_t Val = AM.Disp + Offset;
1788
1789
// Cannot combine ExternalSymbol displacements with integer offsets.
1790
if (Val != 0 && (AM.ES || AM.MCSym))
1791
return true;
1792
1793
CodeModel::Model M = TM.getCodeModel();
1794
if (Subtarget->is64Bit()) {
1795
if (Val != 0 &&
1796
!X86::isOffsetSuitableForCodeModel(Val, M,
1797
AM.hasSymbolicDisplacement()))
1798
return true;
1799
// In addition to the checks required for a register base, check that
1800
// we do not try to use an unsafe Disp with a frame index.
1801
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1802
!isDispSafeForFrameIndex(Val))
1803
return true;
1804
// In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
// 64 bits. Instructions with 32-bit register addresses perform this zero
// extension for us and we can safely ignore the high bits of Offset.
// Instructions with only a 32-bit immediate address do not, though: they
// sign extend instead. This means only the low 2GB of the address space
// is directly addressable; we need indirect addressing for the high 2GB of
// the address space.
1811
// TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1812
// implicit zero extension of instructions would cover up any problem.
1813
// However, we have asserts elsewhere that get triggered if we do, so keep
1814
// the checks for now.
1815
// TODO: We would actually be able to accept these, as well as the same
1816
// addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1817
// to get an address size override to be emitted. However, this
1818
// pseudo-register is not part of any register class and therefore causes
1819
// MIR verification to fail.
1820
if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1821
!AM.hasBaseOrIndexReg())
1822
return true;
1823
}
1824
AM.Disp = Val;
1825
return false;
1826
}
1827
1828
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1829
bool AllowSegmentRegForX32) {
1830
SDValue Address = N->getOperand(1);
1831
1832
// load gs:0 -> GS segment register.
1833
// load fs:0 -> FS segment register.
1834
//
1835
// This optimization is generally valid because the GNU TLS model defines that
1836
// gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1837
// with 32-bit registers, as we get in ILP32 mode, those registers are first
1838
// zero-extended to 64 bits and then added to the base address, which gives
1839
// unwanted results when the register holds a negative value.
1840
// For more information see http://people.redhat.com/drepper/tls.pdf
1841
if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1842
!IndirectTlsSegRefs &&
1843
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1844
Subtarget->isTargetFuchsia())) {
1845
if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1846
return true;
1847
switch (N->getPointerInfo().getAddrSpace()) {
1848
case X86AS::GS:
1849
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1850
return false;
1851
case X86AS::FS:
1852
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1853
return false;
1854
// Address space X86AS::SS is not handled here, because it is not used to
1855
// address TLS areas.
1856
}
1857
}
1858
1859
return true;
1860
}
1861
1862
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1863
/// mode. These wrap things that will resolve down into a symbol reference.
1864
/// If no match is possible, this returns true, otherwise it returns false.
1865
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1866
// If the addressing mode already has a symbol as the displacement, we can
1867
// never match another symbol.
1868
if (AM.hasSymbolicDisplacement())
1869
return true;
1870
1871
bool IsRIPRelTLS = false;
1872
bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1873
if (IsRIPRel) {
1874
SDValue Val = N.getOperand(0);
1875
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1876
IsRIPRelTLS = true;
1877
}
1878
1879
// We can't use an addressing mode in the 64-bit large code model.
1880
// Global TLS addressing is an exception. In the medium code model,
1881
// we can use a mode when RIP wrappers are present.
1882
// That signifies access to globals that are known to be "near",
1883
// such as the GOT itself.
1884
CodeModel::Model M = TM.getCodeModel();
1885
if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1886
return true;
1887
1888
// Base and index reg must be 0 in order to use %rip as base.
1889
if (IsRIPRel && AM.hasBaseOrIndexReg())
1890
return true;
1891
1892
// Make a local copy in case we can't do this fold.
1893
X86ISelAddressMode Backup = AM;
1894
1895
int64_t Offset = 0;
1896
SDValue N0 = N.getOperand(0);
1897
if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1898
AM.GV = G->getGlobal();
1899
AM.SymbolFlags = G->getTargetFlags();
1900
Offset = G->getOffset();
1901
} else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1902
AM.CP = CP->getConstVal();
1903
AM.Alignment = CP->getAlign();
1904
AM.SymbolFlags = CP->getTargetFlags();
1905
Offset = CP->getOffset();
1906
} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1907
AM.ES = S->getSymbol();
1908
AM.SymbolFlags = S->getTargetFlags();
1909
} else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1910
AM.MCSym = S->getMCSymbol();
1911
} else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1912
AM.JT = J->getIndex();
1913
AM.SymbolFlags = J->getTargetFlags();
1914
} else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1915
AM.BlockAddr = BA->getBlockAddress();
1916
AM.SymbolFlags = BA->getTargetFlags();
1917
Offset = BA->getOffset();
1918
} else
1919
llvm_unreachable("Unhandled symbol reference node.");
1920
1921
// Can't use an addressing mode with large globals.
1922
if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1923
TM.isLargeGlobalValue(AM.GV)) {
1924
AM = Backup;
1925
return true;
1926
}
1927
1928
if (foldOffsetIntoAddress(Offset, AM)) {
1929
AM = Backup;
1930
return true;
1931
}
1932
1933
if (IsRIPRel)
1934
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1935
1936
// Commit the changes now that we know this fold is safe.
1937
return false;
1938
}
1939
1940
/// Add the specified node to the specified addressing mode, returning true if
1941
/// it cannot be done. This just pattern matches for the addressing mode.
1942
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1943
if (matchAddressRecursively(N, AM, 0))
1944
return true;
1945
1946
// Post-processing: Make a second attempt to fold a load, if we now know
1947
// that there will not be any other register. This is only performed for
1948
// 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1949
// any foldable load the first time.
1950
if (Subtarget->isTarget64BitILP32() &&
1951
AM.BaseType == X86ISelAddressMode::RegBase &&
1952
AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1953
SDValue Save_Base_Reg = AM.Base_Reg;
1954
if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1955
AM.Base_Reg = SDValue();
1956
if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1957
AM.Base_Reg = Save_Base_Reg;
1958
}
1959
}
1960
1961
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1962
// a smaller encoding and avoids a scaled-index.
1963
if (AM.Scale == 2 &&
1964
AM.BaseType == X86ISelAddressMode::RegBase &&
1965
AM.Base_Reg.getNode() == nullptr) {
1966
AM.Base_Reg = AM.IndexReg;
1967
AM.Scale = 1;
1968
}
1969
1970
// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1971
// because it has a smaller encoding.
1972
if (TM.getCodeModel() != CodeModel::Large &&
1973
(!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1974
AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1975
AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1976
AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1977
AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1978
}
1979
1980
return false;
1981
}
1982
1983
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1984
unsigned Depth) {
1985
// Add an artificial use to this node so that we can keep track of
1986
// it if it gets CSE'd with a different node.
1987
HandleSDNode Handle(N);
1988
1989
X86ISelAddressMode Backup = AM;
1990
if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1991
!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1992
return false;
1993
AM = Backup;
1994
1995
// Try again after commutating the operands.
1996
if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1997
Depth + 1) &&
1998
!matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1999
return false;
2000
AM = Backup;
2001
2002
// If we couldn't fold both operands into the address at the same time,
2003
// see if we can just put each operand into a register and fold at least
2004
// the add.
2005
if (AM.BaseType == X86ISelAddressMode::RegBase &&
2006
!AM.Base_Reg.getNode() &&
2007
!AM.IndexReg.getNode()) {
2008
N = Handle.getValue();
2009
AM.Base_Reg = N.getOperand(0);
2010
AM.IndexReg = N.getOperand(1);
2011
AM.Scale = 1;
2012
return false;
2013
}
2014
N = Handle.getValue();
2015
return true;
2016
}
2017
2018
// Insert a node into the DAG at least before the Pos node's position. This
2019
// will reposition the node as needed, and will assign it a node ID that is <=
2020
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2021
// IDs! The selection DAG must no longer depend on their uniqueness when this
2022
// is used.
2023
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2024
if (N->getNodeId() == -1 ||
2025
(SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2026
SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2027
DAG.RepositionNode(Pos->getIterator(), N.getNode());
2028
// Mark Node as invalid for pruning as after this it may be a successor to a
// selected node but otherwise be in the same position as Pos.
// Conservatively mark it with the same -abs(Id) to ensure the node id
// invariant is preserved.
2032
N->setNodeId(Pos->getNodeId());
2033
SelectionDAGISel::InvalidateNodeId(N.getNode());
2034
}
2035
}
2036
2037
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2038
// safe. This allows us to convert the shift and and into an h-register
2039
// extract and a scaled index. Returns false if the simplification is
2040
// performed.
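// Illustrative instance (C1 == 2, values chosen for the example):
//   (X >> 6) & 0x3fc  ->  ((X >> 8) & 0xff) << 2
// i.e. a byte extract used as an index register with scale 4.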
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2042
uint64_t Mask,
2043
SDValue Shift, SDValue X,
2044
X86ISelAddressMode &AM) {
2045
if (Shift.getOpcode() != ISD::SRL ||
2046
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
2047
!Shift.hasOneUse())
2048
return true;
2049
2050
int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2051
if (ScaleLog <= 0 || ScaleLog >= 4 ||
2052
Mask != (0xffu << ScaleLog))
2053
return true;
2054
2055
MVT XVT = X.getSimpleValueType();
2056
MVT VT = N.getSimpleValueType();
2057
SDLoc DL(N);
2058
SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2059
SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2060
SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2061
SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2062
SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2063
SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2064
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2065
2066
// Insert the new nodes into the topological ordering. We must do this in
2067
// a valid topological ordering as nothing is going to go back and re-sort
2068
// these nodes. We continually insert before 'N' in sequence as this is
2069
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2070
// hierarchy left to express.
2071
insertDAGNode(DAG, N, Eight);
2072
insertDAGNode(DAG, N, NewMask);
2073
insertDAGNode(DAG, N, Srl);
2074
insertDAGNode(DAG, N, And);
2075
insertDAGNode(DAG, N, Ext);
2076
insertDAGNode(DAG, N, ShlCount);
2077
insertDAGNode(DAG, N, Shl);
2078
DAG.ReplaceAllUsesWith(N, Shl);
2079
DAG.RemoveDeadNode(N.getNode());
2080
AM.IndexReg = Ext;
2081
AM.Scale = (1 << ScaleLog);
2082
return false;
2083
}
2084
2085
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2086
// allows us to fold the shift into this addressing mode. Returns false if the
2087
// transform succeeded.
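// Illustrative instance (C1 == 2, C2 == 0x3fc, chosen for the example):
//   (X << 2) & 0x3fc  ->  (X & 0xff) << 2
// so the shift becomes a scale of 4 and the AND keeps a smaller immediate.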
2088
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2089
X86ISelAddressMode &AM) {
2090
SDValue Shift = N.getOperand(0);
2091
2092
// Use a signed mask so that shifting right will insert sign bits. These
2093
// bits will be removed when we shift the result left so it doesn't matter
2094
// what we use. This might allow a smaller immediate encoding.
2095
int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2096
2097
// If we have an any_extend feeding the AND, look through it to see if there
2098
// is a shift behind it. But only if the AND doesn't use the extended bits.
2099
// FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2100
bool FoundAnyExtend = false;
2101
if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2102
Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2103
isUInt<32>(Mask)) {
2104
FoundAnyExtend = true;
2105
Shift = Shift.getOperand(0);
2106
}
2107
2108
if (Shift.getOpcode() != ISD::SHL ||
2109
!isa<ConstantSDNode>(Shift.getOperand(1)))
2110
return true;
2111
2112
SDValue X = Shift.getOperand(0);
2113
2114
// Not likely to be profitable if either the AND or SHIFT node has more
2115
// than one use (unless all uses are for address computation). Besides,
// the isel mechanism requires their node ids to be reused.
2117
if (!N.hasOneUse() || !Shift.hasOneUse())
2118
return true;
2119
2120
// Verify that the shift amount is something we can fold.
2121
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2122
if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2123
return true;
2124
2125
MVT VT = N.getSimpleValueType();
2126
SDLoc DL(N);
2127
if (FoundAnyExtend) {
2128
SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2129
insertDAGNode(DAG, N, NewX);
2130
X = NewX;
2131
}
2132
2133
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2134
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2135
SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2136
2137
// Insert the new nodes into the topological ordering. We must do this in
2138
// a valid topological ordering as nothing is going to go back and re-sort
2139
// these nodes. We continually insert before 'N' in sequence as this is
2140
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2141
// hierarchy left to express.
2142
insertDAGNode(DAG, N, NewMask);
2143
insertDAGNode(DAG, N, NewAnd);
2144
insertDAGNode(DAG, N, NewShift);
2145
DAG.ReplaceAllUsesWith(N, NewShift);
2146
DAG.RemoveDeadNode(N.getNode());
2147
2148
AM.Scale = 1 << ShiftAmt;
2149
AM.IndexReg = NewAnd;
2150
return false;
2151
}
2152
2153
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
// int f(short *y, int *lookup_table) {
// ...
// return *y + lookup_table[*y >> 11];
// }
//
// Turning into:
// movzwl (%rdi), %eax
// movl %eax, %ecx
// shrl $11, %ecx
// addl (%rsi,%rcx,4), %eax
//
// Instead of:
// movzwl (%rdi), %eax
// movl %eax, %ecx
// shrl $9, %ecx
// andl $124, %ecx
// addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
2180
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2181
uint64_t Mask,
2182
SDValue Shift, SDValue X,
2183
X86ISelAddressMode &AM) {
2184
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2185
!isa<ConstantSDNode>(Shift.getOperand(1)))
2186
return true;
2187
2188
// We need to ensure that mask is a continuous run of bits.
2189
unsigned MaskIdx, MaskLen;
2190
if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2191
return true;
2192
unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2193
2194
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2195
2196
// The amount of shift we're trying to fit into the addressing mode is taken
2197
// from the shifted mask index (number of trailing zeros of the mask).
2198
unsigned AMShiftAmt = MaskIdx;
2199
2200
// There is nothing we can do here unless the mask is removing some bits.
2201
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2202
if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2203
2204
// Scale the leading zero count down based on the actual size of the value.
2205
// Also scale it down based on the size of the shift.
2206
unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2207
if (MaskLZ < ScaleDown)
2208
return true;
2209
MaskLZ -= ScaleDown;
2210
2211
// The final check is to ensure that any masked out high bits of X are
2212
// already known to be zero. Otherwise, the mask has a semantic impact
2213
// other than masking out a couple of low bits. Unfortunately, because of
2214
// the mask, zero extensions will be removed from operands in some cases.
2215
// This code works extra hard to look through extensions because we can
2216
// replace them with zero extensions cheaply if necessary.
2217
bool ReplacingAnyExtend = false;
2218
if (X.getOpcode() == ISD::ANY_EXTEND) {
2219
unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2220
X.getOperand(0).getSimpleValueType().getSizeInBits();
2221
// Assume that we'll replace the any-extend with a zero-extend, and
2222
// narrow the search to the extended value.
2223
X = X.getOperand(0);
2224
MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2225
ReplacingAnyExtend = true;
2226
}
2227
APInt MaskedHighBits =
2228
APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2229
if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2230
return true;
2231
2232
// We've identified a pattern that can be transformed into a single shift
2233
// and an addressing mode. Make it so.
2234
MVT VT = N.getSimpleValueType();
2235
if (ReplacingAnyExtend) {
2236
assert(X.getValueType() != VT);
2237
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2238
SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2239
insertDAGNode(DAG, N, NewX);
2240
X = NewX;
2241
}
2242
2243
MVT XVT = X.getSimpleValueType();
2244
SDLoc DL(N);
2245
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2246
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2247
SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2248
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2249
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2250
2251
// Insert the new nodes into the topological ordering. We must do this in
2252
// a valid topological ordering as nothing is going to go back and re-sort
2253
// these nodes. We continually insert before 'N' in sequence as this is
2254
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2255
// hierarchy left to express.
2256
insertDAGNode(DAG, N, NewSRLAmt);
2257
insertDAGNode(DAG, N, NewSRL);
2258
insertDAGNode(DAG, N, NewExt);
2259
insertDAGNode(DAG, N, NewSHLAmt);
2260
insertDAGNode(DAG, N, NewSHL);
2261
DAG.ReplaceAllUsesWith(N, NewSHL);
2262
DAG.RemoveDeadNode(N.getNode());
2263
2264
AM.Scale = 1 << AMShiftAmt;
2265
AM.IndexReg = NewExt;
2266
return false;
2267
}
2268
2269
// Transform "(X >> SHIFT) & (MASK << C1)" to
2270
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2271
// matched to a BEXTR later. Returns false if the simplification is performed.
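// Illustrative instance (SHIFT == 5, C1 == 2, MASK == 0xff):
//   (X >> 5) & (0xff << 2)  ->  ((X >> 7) & 0xff) << 2
// where the srl+and is a BEXTR candidate and the shl becomes a scale of 4.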
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2273
uint64_t Mask,
2274
SDValue Shift, SDValue X,
2275
X86ISelAddressMode &AM,
2276
const X86Subtarget &Subtarget) {
2277
if (Shift.getOpcode() != ISD::SRL ||
2278
!isa<ConstantSDNode>(Shift.getOperand(1)) ||
2279
!Shift.hasOneUse() || !N.hasOneUse())
2280
return true;
2281
2282
// Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2283
if (!Subtarget.hasTBM() &&
2284
!(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2285
return true;
2286
2287
// We need to ensure that mask is a continuous run of bits.
2288
unsigned MaskIdx, MaskLen;
2289
if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2290
return true;
2291
2292
unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2293
2294
// The amount of shift we're trying to fit into the addressing mode is taken
2295
// from the shifted mask index (number of trailing zeros of the mask).
2296
unsigned AMShiftAmt = MaskIdx;
2297
2298
// There is nothing we can do here unless the mask is removing some bits.
2299
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2300
if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2301
2302
MVT XVT = X.getSimpleValueType();
2303
MVT VT = N.getSimpleValueType();
2304
SDLoc DL(N);
2305
SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2306
SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2307
SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2308
SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2309
SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2310
SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2311
SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2312
2313
// Insert the new nodes into the topological ordering. We must do this in
2314
// a valid topological ordering as nothing is going to go back and re-sort
2315
// these nodes. We continually insert before 'N' in sequence as this is
2316
// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2317
// hierarchy left to express.
2318
insertDAGNode(DAG, N, NewSRLAmt);
2319
insertDAGNode(DAG, N, NewSRL);
2320
insertDAGNode(DAG, N, NewMask);
2321
insertDAGNode(DAG, N, NewAnd);
2322
insertDAGNode(DAG, N, NewExt);
2323
insertDAGNode(DAG, N, NewSHLAmt);
2324
insertDAGNode(DAG, N, NewSHL);
2325
DAG.ReplaceAllUsesWith(N, NewSHL);
2326
DAG.RemoveDeadNode(N.getNode());
2327
2328
AM.Scale = 1 << AMShiftAmt;
2329
AM.IndexReg = NewExt;
2330
return false;
2331
}
2332
2333
// Attempt to peek further into a scaled index register, collecting additional
2334
// extensions / offsets / etc. Returns \p N if we can't peek any further.
2335
SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2336
X86ISelAddressMode &AM,
2337
unsigned Depth) {
2338
assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2339
assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2340
"Illegal index scale");
2341
2342
// Limit recursion.
2343
if (Depth >= SelectionDAG::MaxRecursionDepth)
2344
return N;
2345
2346
EVT VT = N.getValueType();
2347
unsigned Opc = N.getOpcode();
2348
2349
// index: add(x,c) -> index: x, disp + c
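// (for example, with an existing scale of 4, add(x,3) folds as disp += 12
// and matching continues through x)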
if (CurDAG->isBaseWithConstantOffset(N)) {
2351
auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2352
uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2353
if (!foldOffsetIntoAddress(Offset, AM))
2354
return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2355
}
2356
2357
// index: add(x,x) -> index: x, scale * 2
2358
if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2359
if (AM.Scale <= 4) {
2360
AM.Scale *= 2;
2361
return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2362
}
2363
}
2364
2365
// index: shl(x,i) -> index: x, scale * (1 << i)
2366
if (Opc == X86ISD::VSHLI) {
2367
uint64_t ShiftAmt = N.getConstantOperandVal(1);
2368
uint64_t ScaleAmt = 1ULL << ShiftAmt;
2369
if ((AM.Scale * ScaleAmt) <= 8) {
2370
AM.Scale *= ScaleAmt;
2371
return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2372
}
2373
}
2374
2375
// index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2376
// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2377
if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2378
SDValue Src = N.getOperand(0);
2379
if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2380
Src.hasOneUse()) {
2381
if (CurDAG->isBaseWithConstantOffset(Src)) {
2382
SDValue AddSrc = Src.getOperand(0);
2383
auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2384
uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2385
if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2386
SDLoc DL(N);
2387
SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2388
SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2389
SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2390
insertDAGNode(*CurDAG, N, ExtSrc);
2391
insertDAGNode(*CurDAG, N, ExtVal);
2392
insertDAGNode(*CurDAG, N, ExtAdd);
2393
CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2394
CurDAG->RemoveDeadNode(N.getNode());
2395
return ExtSrc;
2396
}
2397
}
2398
}
2399
}
2400
2401
// index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2402
// index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2403
// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2404
if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2405
SDValue Src = N.getOperand(0);
2406
unsigned SrcOpc = Src.getOpcode();
2407
if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2408
CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2409
Src.hasOneUse()) {
2410
if (CurDAG->isBaseWithConstantOffset(Src)) {
2411
SDValue AddSrc = Src.getOperand(0);
2412
uint64_t Offset = Src.getConstantOperandVal(1);
2413
if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2414
SDLoc DL(N);
2415
SDValue Res;
2416
// If we're also scaling, see if we can use that as well.
2417
if (AddSrc.getOpcode() == ISD::SHL &&
2418
isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2419
SDValue ShVal = AddSrc.getOperand(0);
2420
uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2421
APInt HiBits =
2422
APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2423
uint64_t ScaleAmt = 1ULL << ShAmt;
2424
if ((AM.Scale * ScaleAmt) <= 8 &&
2425
(AddSrc->getFlags().hasNoUnsignedWrap() ||
2426
CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2427
AM.Scale *= ScaleAmt;
2428
SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2429
SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2430
AddSrc.getOperand(1));
2431
insertDAGNode(*CurDAG, N, ExtShVal);
2432
insertDAGNode(*CurDAG, N, ExtShift);
2433
AddSrc = ExtShift;
2434
Res = ExtShVal;
2435
}
2436
}
2437
SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2438
SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2439
SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2440
insertDAGNode(*CurDAG, N, ExtSrc);
2441
insertDAGNode(*CurDAG, N, ExtVal);
2442
insertDAGNode(*CurDAG, N, ExtAdd);
2443
CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2444
CurDAG->RemoveDeadNode(N.getNode());
2445
return Res ? Res : ExtSrc;
2446
}
2447
}
2448
}
2449
}
2450
2451
// TODO: Handle extensions, shifted masks etc.
2452
return N;
2453
}
2454
2455
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2456
unsigned Depth) {
2457
SDLoc dl(N);
2458
LLVM_DEBUG({
2459
dbgs() << "MatchAddress: ";
2460
AM.dump(CurDAG);
2461
});
2462
// Limit recursion.
2463
if (Depth >= SelectionDAG::MaxRecursionDepth)
2464
return matchAddressBase(N, AM);
2465
2466
// If this is already a %rip relative address, we can only merge immediates
2467
// into it. Instead of handling this in every case, we handle it here.
2468
// RIP relative addressing: %rip + 32-bit displacement!
2469
if (AM.isRIPRelative()) {
2470
// FIXME: JumpTable and ExternalSymbol address currently don't like
2471
// displacements. It isn't very important, but this should be fixed for
2472
// consistency.
2473
if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2474
return true;
2475
2476
if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2477
if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2478
return false;
2479
return true;
2480
}
2481
2482
switch (N.getOpcode()) {
2483
default: break;
2484
case ISD::LOCAL_RECOVER: {
2485
if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2486
if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2487
// Use the symbol and don't prefix it.
2488
AM.MCSym = ESNode->getMCSymbol();
2489
return false;
2490
}
2491
break;
2492
}
2493
case ISD::Constant: {
2494
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2495
if (!foldOffsetIntoAddress(Val, AM))
2496
return false;
2497
break;
2498
}
2499
2500
case X86ISD::Wrapper:
2501
case X86ISD::WrapperRIP:
2502
if (!matchWrapper(N, AM))
2503
return false;
2504
break;
2505
2506
case ISD::LOAD:
2507
if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2508
return false;
2509
break;
2510
2511
case ISD::FrameIndex:
2512
if (AM.BaseType == X86ISelAddressMode::RegBase &&
2513
AM.Base_Reg.getNode() == nullptr &&
2514
(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2515
AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2516
AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2517
return false;
2518
}
2519
break;
2520
2521
case ISD::SHL:
2522
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2523
break;
2524
2525
if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2526
unsigned Val = CN->getZExtValue();
2527
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2528
// that the base operand remains free for further matching. If
2529
// the base doesn't end up getting used, a post-processing step
2530
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2531
if (Val == 1 || Val == 2 || Val == 3) {
2532
SDValue ShVal = N.getOperand(0);
2533
AM.Scale = 1 << Val;
2534
AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2535
return false;
2536
}
2537
}
2538
break;
2539
2540
case ISD::SRL: {
2541
// Scale must not be used already.
2542
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2543
2544
// We only handle up to 64-bit values here as those are what matter for
2545
// addressing mode optimizations.
2546
assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2547
"Unexpected value size!");
2548
2549
SDValue And = N.getOperand(0);
2550
if (And.getOpcode() != ISD::AND) break;
2551
SDValue X = And.getOperand(0);
2552
2553
// The mask used for the transform is expected to be post-shift, but we
2554
// found the shift first so just apply the shift to the mask before passing
2555
// it down.
2556
if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2557
!isa<ConstantSDNode>(And.getOperand(1)))
2558
break;
2559
uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2560
2561
// Try to fold the mask and shift into the scale, and return false if we
2562
// succeed.
2563
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2564
return false;
2565
break;
2566
}
2567
2568
case ISD::SMUL_LOHI:
2569
case ISD::UMUL_LOHI:
2570
// A mul_lohi where we need the low part can be folded as a plain multiply.
2571
if (N.getResNo() != 0) break;
2572
[[fallthrough]];
2573
case ISD::MUL:
2574
case X86ISD::MUL_IMM:
2575
// X*[3,5,9] -> X+X*[2,4,8]
2576
if (AM.BaseType == X86ISelAddressMode::RegBase &&
2577
AM.Base_Reg.getNode() == nullptr &&
2578
AM.IndexReg.getNode() == nullptr) {
2579
if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2580
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2581
CN->getZExtValue() == 9) {
2582
AM.Scale = unsigned(CN->getZExtValue())-1;
2583
2584
SDValue MulVal = N.getOperand(0);
2585
SDValue Reg;
2586
2587
// Okay, we know that we have a scale by now. However, if the scaled
2588
// value is an add of something and a constant, we can fold the
2589
// constant into the disp field here.
2590
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2591
isa<ConstantSDNode>(MulVal.getOperand(1))) {
2592
Reg = MulVal.getOperand(0);
2593
auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2594
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2595
if (foldOffsetIntoAddress(Disp, AM))
2596
Reg = N.getOperand(0);
2597
} else {
2598
Reg = N.getOperand(0);
2599
}
2600
2601
AM.IndexReg = AM.Base_Reg = Reg;
2602
return false;
2603
}
2604
}
2605
break;
2606
2607
case ISD::SUB: {
2608
// Given A-B, if A can be completely folded into the address, with the
// index field left unused, use -B as the index.
// This is a win if A has multiple parts that can be folded into
// the address. Also, this saves a mov if the base register has
// other uses, since it avoids a two-address sub instruction; however,
// it costs an additional mov if the index register has other uses.
2614
2615
// Add an artificial use to this node so that we can keep track of
2616
// it if it gets CSE'd with a different node.
2617
HandleSDNode Handle(N);
2618
2619
// Test if the LHS of the sub can be folded.
2620
X86ISelAddressMode Backup = AM;
2621
if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2622
N = Handle.getValue();
2623
AM = Backup;
2624
break;
2625
}
2626
N = Handle.getValue();
2627
// Test if the index field is free for use.
2628
if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2629
AM = Backup;
2630
break;
2631
}
2632
2633
int Cost = 0;
2634
SDValue RHS = N.getOperand(1);
2635
// If the RHS involves a register with multiple uses, this
2636
// transformation incurs an extra mov, due to the neg instruction
2637
// clobbering its operand.
2638
if (!RHS.getNode()->hasOneUse() ||
2639
RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2640
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2641
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2642
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2643
RHS.getOperand(0).getValueType() == MVT::i32))
2644
++Cost;
2645
// If the base is a register with multiple uses, this
2646
// transformation may save a mov.
2647
if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2648
!AM.Base_Reg.getNode()->hasOneUse()) ||
2649
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2650
--Cost;
2651
// If the folded LHS was interesting, this transformation saves
2652
// address arithmetic.
2653
if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2654
((AM.Disp != 0) && (Backup.Disp == 0)) +
2655
(AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2656
--Cost;
2657
// If it doesn't look like it may be an overall win, don't do it.
2658
if (Cost >= 0) {
2659
AM = Backup;
2660
break;
2661
}
2662
2663
// Ok, the transformation is legal and appears profitable. Go for it.
2664
// Negation will be emitted later to avoid creating dangling nodes if this
2665
// was an unprofitable LEA.
2666
AM.IndexReg = RHS;
2667
AM.NegateIndex = true;
2668
AM.Scale = 1;
2669
return false;
2670
}
2671
2672
case ISD::OR:
2673
case ISD::XOR:
2674
// See if we can treat the OR/XOR node as an ADD node.
2675
if (!CurDAG->isADDLike(N))
2676
break;
2677
[[fallthrough]];
2678
case ISD::ADD:
2679
if (!matchAdd(N, AM, Depth))
2680
return false;
2681
break;
2682
2683
case ISD::AND: {
2684
// Perform some heroic transforms on an and of a constant-count shift
2685
// with a constant to enable use of the scaled offset field.
2686
2687
// Scale must not be used already.
2688
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2689
2690
// We only handle up to 64-bit values here as those are what matter for
2691
// addressing mode optimizations.
2692
assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2693
"Unexpected value size!");
2694
2695
if (!isa<ConstantSDNode>(N.getOperand(1)))
2696
break;
2697
2698
if (N.getOperand(0).getOpcode() == ISD::SRL) {
2699
SDValue Shift = N.getOperand(0);
2700
SDValue X = Shift.getOperand(0);
2701
2702
uint64_t Mask = N.getConstantOperandVal(1);
2703
2704
// Try to fold the mask and shift into an extract and scale.
2705
if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2706
return false;
2707
2708
// Try to fold the mask and shift directly into the scale.
2709
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2710
return false;
2711
2712
// Try to fold the mask and shift into BEXTR and scale.
2713
if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2714
return false;
2715
}
2716
2717
// Try to swap the mask and shift to place shifts which can be done as
2718
// a scale on the outside of the mask.
2719
if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2720
return false;
2721
2722
break;
2723
}
2724
case ISD::ZERO_EXTEND: {
2725
// Try to widen a zexted shift left to the same size as its use, so we can
2726
// match the shift as a scale factor.
2727
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2728
break;
2729
2730
SDValue Src = N.getOperand(0);
2731
2732
// See if we can match a zext(addlike(x,c)).
2733
// TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2734
if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2735
if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2736
if (Index != N) {
2737
AM.IndexReg = Index;
2738
return false;
2739
}
2740
2741
// Peek through mask: zext(and(shl(x,c1),c2))
2742
APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2743
if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2744
if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2745
Mask = MaskC->getAPIntValue();
2746
Src = Src.getOperand(0);
2747
}
2748
2749
if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2750
// Give up if the shift is not a valid scale factor [1,2,3].
2751
SDValue ShlSrc = Src.getOperand(0);
2752
SDValue ShlAmt = Src.getOperand(1);
2753
auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2754
if (!ShAmtC)
2755
break;
2756
unsigned ShAmtV = ShAmtC->getZExtValue();
2757
if (ShAmtV > 3)
2758
break;
2759
2760
// The narrow shift must only shift out zero bits (it must be 'nuw').
2761
// That makes it safe to widen to the destination type.
2762
APInt HighZeros =
2763
APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2764
if (!Src->getFlags().hasNoUnsignedWrap() &&
2765
!CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2766
break;
2767
2768
// zext (shl nuw i8 %x, C1) to i32
2769
// --> shl (zext i8 %x to i32), (zext C1)
2770
// zext (and (shl nuw i8 %x, C1), C2) to i32
2771
// --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2772
MVT SrcVT = ShlSrc.getSimpleValueType();
2773
MVT VT = N.getSimpleValueType();
2774
SDLoc DL(N);
2775
2776
SDValue Res = ShlSrc;
2777
if (!Mask.isAllOnes()) {
2778
Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2779
insertDAGNode(*CurDAG, N, Res);
2780
Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2781
insertDAGNode(*CurDAG, N, Res);
2782
}
2783
SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2784
insertDAGNode(*CurDAG, N, Zext);
2785
SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2786
insertDAGNode(*CurDAG, N, NewShl);
2787
CurDAG->ReplaceAllUsesWith(N, NewShl);
2788
CurDAG->RemoveDeadNode(N.getNode());
2789
2790
// Convert the shift to scale factor.
2791
AM.Scale = 1 << ShAmtV;
2792
// If matchIndexRecursively is not called here, Zext may be replaced by
// other nodes but still be used later when calling a builder method.
2795
AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2796
return false;
2797
}
2798
2799
if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2800
// Try to fold the mask and shift into an extract and scale.
2801
if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2802
Src.getOperand(0), AM))
2803
return false;
2804
2805
// Try to fold the mask and shift directly into the scale.
2806
if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2807
Src.getOperand(0), AM))
2808
return false;
2809
2810
// Try to fold the mask and shift into BEXTR and scale.
2811
if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2812
Src.getOperand(0), AM, *Subtarget))
2813
return false;
2814
}
2815
2816
break;
2817
}
2818
}
2819
2820
return matchAddressBase(N, AM);
2821
}
2822
2823
/// Helper for MatchAddress. Add the specified node to the
2824
/// specified addressing mode without any further recursion.
2825
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2826
// Is the base register already occupied?
2827
if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2828
// If so, check to see if the scale index register is set.
2829
if (!AM.IndexReg.getNode()) {
2830
AM.IndexReg = N;
2831
AM.Scale = 1;
2832
return false;
2833
}
2834
2835
// Otherwise, we cannot select it.
2836
return true;
2837
}
2838
2839
// Default, generate it as a register.
2840
AM.BaseType = X86ISelAddressMode::RegBase;
2841
AM.Base_Reg = N;
2842
return false;
2843
}
2844
2845
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2846
X86ISelAddressMode &AM,
2847
unsigned Depth) {
2848
SDLoc dl(N);
2849
LLVM_DEBUG({
2850
dbgs() << "MatchVectorAddress: ";
2851
AM.dump(CurDAG);
2852
});
2853
// Limit recursion.
2854
if (Depth >= SelectionDAG::MaxRecursionDepth)
2855
return matchAddressBase(N, AM);
2856
2857
// TODO: Support other operations.
2858
switch (N.getOpcode()) {
2859
case ISD::Constant: {
2860
uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2861
if (!foldOffsetIntoAddress(Val, AM))
2862
return false;
2863
break;
2864
}
2865
case X86ISD::Wrapper:
2866
if (!matchWrapper(N, AM))
2867
return false;
2868
break;
2869
case ISD::ADD: {
2870
// Add an artificial use to this node so that we can keep track of
2871
// it if it gets CSE'd with a different node.
2872
HandleSDNode Handle(N);
2873
2874
X86ISelAddressMode Backup = AM;
2875
if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2876
!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2877
Depth + 1))
2878
return false;
2879
AM = Backup;
2880
2881
// Try again after commuting the operands.
2882
if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2883
Depth + 1) &&
2884
!matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2885
Depth + 1))
2886
return false;
2887
AM = Backup;
2888
2889
N = Handle.getValue();
2890
break;
2891
}
2892
}
2893
2894
return matchAddressBase(N, AM);
2895
}
2896
2897
/// Helper for selectVectorAddr. Handles things that can be folded into a
2898
/// gather/scatter address. The index register and scale should have already
2899
/// been handled.
2900
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2901
return matchVectorAddressRecursively(N, AM, 0);
2902
}
2903
2904
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2905
SDValue IndexOp, SDValue ScaleOp,
2906
SDValue &Base, SDValue &Scale,
2907
SDValue &Index, SDValue &Disp,
2908
SDValue &Segment) {
2909
X86ISelAddressMode AM;
2910
AM.Scale = ScaleOp->getAsZExtVal();
2911
2912
// Attempt to match index patterns, as long as we're not relying on implicit
2913
// sign-extension, which is performed BEFORE scale.
2914
if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2915
AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2916
else
2917
AM.IndexReg = IndexOp;
2918
2919
unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2920
if (AddrSpace == X86AS::GS)
2921
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2922
if (AddrSpace == X86AS::FS)
2923
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2924
if (AddrSpace == X86AS::SS)
2925
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2926
2927
SDLoc DL(BasePtr);
2928
MVT VT = BasePtr.getSimpleValueType();
2929
2930
// Try to match into the base and displacement fields.
2931
if (matchVectorAddress(BasePtr, AM))
2932
return false;
2933
2934
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2935
return true;
2936
}
2937
2938
/// Returns true if it is able to pattern match an addressing mode.
2939
/// It returns the operands which make up the maximal addressing mode it can
2940
/// match by reference.
2941
///
2942
/// Parent is the parent node of the addr operand that is being matched. It
2943
/// is always a load, store, atomic node, or null. It is only null when
2944
/// checking memory operands for inline asm nodes.
2945
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2946
SDValue &Scale, SDValue &Index,
2947
SDValue &Disp, SDValue &Segment) {
2948
X86ISelAddressMode AM;
2949
2950
if (Parent &&
2951
// These opcodes are all the nodes that have an "addr:$ptr" operand
2952
// that are not a MemSDNode, and thus don't have proper addrspace info.
2953
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2954
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2955
Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2956
Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2957
Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2958
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2959
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2960
unsigned AddrSpace =
2961
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2962
if (AddrSpace == X86AS::GS)
2963
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2964
if (AddrSpace == X86AS::FS)
2965
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2966
if (AddrSpace == X86AS::SS)
2967
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2968
}
2969
2970
// Save the DL and VT before calling matchAddress, it can invalidate N.
2971
SDLoc DL(N);
2972
MVT VT = N.getSimpleValueType();
2973
2974
if (matchAddress(N, AM))
2975
return false;
2976
2977
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2978
return true;
2979
}
2980
2981
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2982
// Cannot use 32 bit constants to reference objects in kernel/large code
2983
// model.
2984
if (TM.getCodeModel() == CodeModel::Kernel ||
2985
TM.getCodeModel() == CodeModel::Large)
2986
return false;
2987
2988
// In static codegen with small code model, we can get the address of a label
2989
// into a register with 'movl'.
2990
if (N->getOpcode() != X86ISD::Wrapper)
2991
return false;
2992
2993
N = N.getOperand(0);
2994
2995
// At least GNU as does not accept 'movl' for TPOFF relocations.
2996
// FIXME: We could use 'movl' when we know we are targeting MC.
2997
if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2998
return false;
2999
3000
Imm = N;
3001
// Small/medium code model can reference non-TargetGlobalAddress objects with
3002
// 32 bit constants.
3003
if (N->getOpcode() != ISD::TargetGlobalAddress) {
3004
return TM.getCodeModel() == CodeModel::Small ||
3005
TM.getCodeModel() == CodeModel::Medium;
3006
}
3007
3008
const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3009
if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3010
return CR->getUnsignedMax().ult(1ull << 32);
3011
3012
return !TM.isLargeGlobalValue(GV);
3013
}
3014
3015
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3016
SDValue &Scale, SDValue &Index,
3017
SDValue &Disp, SDValue &Segment) {
3018
// Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3019
SDLoc DL(N);
3020
3021
if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3022
return false;
3023
3024
auto *RN = dyn_cast<RegisterSDNode>(Base);
3025
if (RN && RN->getReg() == 0)
3026
Base = CurDAG->getRegister(0, MVT::i64);
3027
else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3028
// Base could already be %rip, particularly in the x32 ABI.
3029
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3030
MVT::i64), 0);
3031
Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3032
Base);
3033
}
3034
3035
RN = dyn_cast<RegisterSDNode>(Index);
3036
if (RN && RN->getReg() == 0)
3037
Index = CurDAG->getRegister(0, MVT::i64);
3038
else {
3039
assert(Index.getValueType() == MVT::i32 &&
3040
"Expect to be extending 32-bit registers for use in LEA");
3041
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3042
MVT::i64), 0);
3043
Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3044
Index);
3045
}
3046
3047
return true;
3048
}
3049
3050
/// Calls SelectAddr and determines if the maximal addressing
3051
/// mode it matches can be cost effectively emitted as an LEA instruction.
3052
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3053
SDValue &Base, SDValue &Scale,
3054
SDValue &Index, SDValue &Disp,
3055
SDValue &Segment) {
3056
X86ISelAddressMode AM;
3057
3058
// Save the DL and VT before calling matchAddress, it can invalidate N.
3059
SDLoc DL(N);
3060
MVT VT = N.getSimpleValueType();
3061
3062
// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3063
// segments.
3064
SDValue Copy = AM.Segment;
3065
SDValue T = CurDAG->getRegister(0, MVT::i32);
3066
AM.Segment = T;
3067
if (matchAddress(N, AM))
3068
return false;
3069
assert (T == AM.Segment);
3070
AM.Segment = Copy;
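
// Rough scoring used below (a summary, not an exact specification): a plain
// base register counts 1, a frame index counts 4, a symbolic displacement
// counts 4 on 64-bit (otherwise +2), and an index register, a scale greater
// than 1, a non-zero displacement, or flag-producing ADD operands each add
// 1; a total of 2 or less is not considered worth an LEA.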
unsigned Complexity = 0;
3073
if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3074
Complexity = 1;
3075
else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3076
Complexity = 4;
3077
3078
if (AM.IndexReg.getNode())
3079
Complexity++;
3080
3081
// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3082
// a simple shift.
3083
if (AM.Scale > 1)
3084
Complexity++;
3085
3086
// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3087
// to a LEA. This is determined with some experimentation but is by no means
3088
// optimal (especially for code size consideration). LEA is nice because of
3089
// its three-address nature. Tweak the cost function again when we can run
3090
// convertToThreeAddress() at register allocation time.
3091
if (AM.hasSymbolicDisplacement()) {
3092
// For X86-64, always use LEA to materialize RIP-relative addresses.
3093
if (Subtarget->is64Bit())
3094
Complexity = 4;
3095
else
3096
Complexity += 2;
3097
}
3098
3099
// Heuristic: try harder to form an LEA from ADD if the operands set flags.
3100
// Unlike ADD, LEA does not affect flags, so we will be less likely to require
3101
// duplicating flag-producing instructions later in the pipeline.
3102
if (N.getOpcode() == ISD::ADD) {
3103
auto isMathWithFlags = [](SDValue V) {
3104
switch (V.getOpcode()) {
3105
case X86ISD::ADD:
3106
case X86ISD::SUB:
3107
case X86ISD::ADC:
3108
case X86ISD::SBB:
3109
case X86ISD::SMUL:
3110
case X86ISD::UMUL:
3111
/* TODO: These opcodes can be added safely, but we may want to justify
3112
their inclusion for different reasons (better for reg-alloc).
3113
case X86ISD::OR:
3114
case X86ISD::XOR:
3115
case X86ISD::AND:
3116
*/
3117
// Value 1 is the flag output of the node - verify it's not dead.
3118
return !SDValue(V.getNode(), 1).use_empty();
3119
default:
3120
return false;
3121
}
3122
};
3123
// TODO: We might want to factor in whether there's a load folding
3124
// opportunity for the math op that disappears with LEA.
3125
if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3126
Complexity++;
3127
}
3128
3129
if (AM.Disp)
3130
Complexity++;
3131
3132
// If it isn't worth using an LEA, reject it.
3133
if (Complexity <= 2)
3134
return false;
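// For example, (add (add %x, %y), 8) scores base + index + displacement = 3
// and becomes something like leaq 8(%x,%y), %dst, while a bare (add %x, %y)
// only scores 2 and is rejected above in favor of a two-operand ADD.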
3135
3136
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3137
return true;
3138
}
3139
3140
/// This is only run on TargetGlobalTLSAddress nodes.
3141
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3142
SDValue &Scale, SDValue &Index,
3143
SDValue &Disp, SDValue &Segment) {
3144
assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3145
N.getOpcode() == ISD::TargetExternalSymbol);
3146
3147
X86ISelAddressMode AM;
3148
if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3149
AM.GV = GA->getGlobal();
3150
AM.Disp += GA->getOffset();
3151
AM.SymbolFlags = GA->getTargetFlags();
3152
} else {
3153
auto *SA = cast<ExternalSymbolSDNode>(N);
3154
AM.ES = SA->getSymbol();
3155
AM.SymbolFlags = SA->getTargetFlags();
3156
}
3157
3158
if (Subtarget->is32Bit()) {
3159
AM.Scale = 1;
3160
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3161
}
3162
3163
MVT VT = N.getSimpleValueType();
3164
getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3165
return true;
3166
}
3167
3168
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3169
// Keep track of the original value type and whether this value was
3170
// truncated. If we see a truncation from pointer type to VT that truncates
3171
// bits that are known to be zero, we can use a narrow reference.
3172
EVT VT = N.getValueType();
3173
bool WasTruncated = false;
3174
if (N.getOpcode() == ISD::TRUNCATE) {
3175
WasTruncated = true;
3176
N = N.getOperand(0);
3177
}
3178
3179
if (N.getOpcode() != X86ISD::Wrapper)
3180
return false;
3181
3182
// We can only use non-GlobalValues as immediates if they were not truncated,
3183
// as we do not have any range information. If we have a GlobalValue and the
3184
// address was not truncated, we can select it as an operand directly.
3185
unsigned Opc = N.getOperand(0)->getOpcode();
3186
if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3187
Op = N.getOperand(0);
3188
// We can only select the operand directly if we didn't have to look past a
3189
// truncate.
3190
return !WasTruncated;
3191
}
3192
3193
// Check that the global's range fits into VT.
3194
auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3195
std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3196
if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3197
return false;
3198
3199
// Okay, we can use a narrow reference.
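// E.g. (illustrative IR) a global declared as
//   @g = external global i8, !absolute_symbol !0
//   !0 = !{i64 0, i64 256}
// passes the range check above when VT is i8.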
3200
Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3201
GA->getOffset(), GA->getTargetFlags());
3202
return true;
3203
}
3204
3205
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3206
SDValue &Base, SDValue &Scale,
3207
SDValue &Index, SDValue &Disp,
3208
SDValue &Segment) {
3209
assert(Root && P && "Unknown root/parent nodes");
3210
if (!ISD::isNON_EXTLoad(N.getNode()) ||
3211
!IsProfitableToFold(N, P, Root) ||
3212
!IsLegalToFold(N, P, Root, OptLevel))
3213
return false;
3214
3215
return selectAddr(N.getNode(),
3216
N.getOperand(1), Base, Scale, Index, Disp, Segment);
3217
}
3218
3219
bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3220
SDValue &Base, SDValue &Scale,
3221
SDValue &Index, SDValue &Disp,
3222
SDValue &Segment) {
3223
assert(Root && P && "Unknown root/parent nodes");
3224
if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3225
!IsProfitableToFold(N, P, Root) ||
3226
!IsLegalToFold(N, P, Root, OptLevel))
3227
return false;
3228
3229
return selectAddr(N.getNode(),
3230
N.getOperand(1), Base, Scale, Index, Disp, Segment);
3231
}
3232
3233
/// Return an SDNode that returns the value of the global base register.
3234
/// Output instructions required to initialize the global base register,
3235
/// if necessary.
3236
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3237
unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3238
auto &DL = MF->getDataLayout();
3239
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3240
}
3241
3242
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3243
if (N->getOpcode() == ISD::TRUNCATE)
3244
N = N->getOperand(0).getNode();
3245
if (N->getOpcode() != X86ISD::Wrapper)
3246
return false;
3247
3248
auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3249
if (!GA)
3250
return false;
3251
3252
auto *GV = GA->getGlobal();
3253
std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3254
if (CR)
3255
return CR->getSignedMin().sge(-1ull << Width) &&
3256
CR->getSignedMax().slt(1ull << Width);
3257
// In the kernel code model, globals are in the negative 2GB of the address
3258
// space, so globals can be a sign extended 32-bit immediate.
3259
// In other code models, small globals are in the low 2GB of the address
3260
// space, so sign extending them is equivalent to zero extending them.
3261
return Width == 32 && !TM.isLargeGlobalValue(GV);
3262
}
3263
3264
X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3265
assert(N->isMachineOpcode() && "Unexpected node");
3266
unsigned Opc = N->getMachineOpcode();
3267
const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3268
int CondNo = X86::getCondSrcNoFromDesc(MCID);
3269
if (CondNo < 0)
3270
return X86::COND_INVALID;
3271
3272
return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3273
}
3274
3275
/// Test whether the given X86ISD::CMP node has any users that use a flag
3276
/// other than ZF.
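/// (E.g. a SETE/SETNE or JE/JNE user only reads ZF and is fine; a SETB user
/// also reads CF and causes this to return false.)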
3277
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3278
// Examine each user of the node.
3279
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3280
UI != UE; ++UI) {
3281
// Only check things that use the flags.
3282
if (UI.getUse().getResNo() != Flags.getResNo())
3283
continue;
3284
// Only examine CopyToReg uses that copy to EFLAGS.
3285
if (UI->getOpcode() != ISD::CopyToReg ||
3286
cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3287
return false;
3288
// Examine each user of the CopyToReg use.
3289
for (SDNode::use_iterator FlagUI = UI->use_begin(),
3290
FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3291
// Only examine the Flag result.
3292
if (FlagUI.getUse().getResNo() != 1) continue;
3293
// Anything unusual: assume conservatively.
3294
if (!FlagUI->isMachineOpcode()) return false;
3295
// Examine the condition code of the user.
3296
X86::CondCode CC = getCondFromNode(*FlagUI);
3297
3298
switch (CC) {
3299
// Comparisons which only use the zero flag.
3300
case X86::COND_E: case X86::COND_NE:
3301
continue;
3302
// Anything else: assume conservatively.
3303
default:
3304
return false;
3305
}
3306
}
3307
}
3308
return true;
3309
}
3310
3311
/// Test whether the given X86ISD::CMP node has any uses which require the SF
3312
/// flag to be accurate.
3313
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3314
// Examine each user of the node.
3315
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3316
UI != UE; ++UI) {
3317
// Only check things that use the flags.
3318
if (UI.getUse().getResNo() != Flags.getResNo())
3319
continue;
3320
// Only examine CopyToReg uses that copy to EFLAGS.
3321
if (UI->getOpcode() != ISD::CopyToReg ||
3322
cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3323
return false;
3324
// Examine each user of the CopyToReg use.
3325
for (SDNode::use_iterator FlagUI = UI->use_begin(),
3326
FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3327
// Only examine the Flag result.
3328
if (FlagUI.getUse().getResNo() != 1) continue;
3329
// Anything unusual: assume conservatively.
3330
if (!FlagUI->isMachineOpcode()) return false;
3331
// Examine the condition code of the user.
3332
X86::CondCode CC = getCondFromNode(*FlagUI);
3333
3334
switch (CC) {
3335
// Comparisons which don't examine the SF flag.
3336
case X86::COND_A: case X86::COND_AE:
3337
case X86::COND_B: case X86::COND_BE:
3338
case X86::COND_E: case X86::COND_NE:
3339
case X86::COND_O: case X86::COND_NO:
3340
case X86::COND_P: case X86::COND_NP:
3341
continue;
3342
// Anything else: assume conservatively.
3343
default:
3344
return false;
3345
}
3346
}
3347
}
3348
return true;
3349
}
3350
3351
static bool mayUseCarryFlag(X86::CondCode CC) {
3352
switch (CC) {
3353
// Comparisons which don't examine the CF flag.
3354
case X86::COND_O: case X86::COND_NO:
3355
case X86::COND_E: case X86::COND_NE:
3356
case X86::COND_S: case X86::COND_NS:
3357
case X86::COND_P: case X86::COND_NP:
3358
case X86::COND_L: case X86::COND_GE:
3359
case X86::COND_G: case X86::COND_LE:
3360
return false;
3361
// Anything else: assume conservatively.
3362
default:
3363
return true;
3364
}
3365
}
3366
3367
/// Test whether the given node which sets flags has any uses which require the
3368
/// CF flag to be accurate.
3369
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3370
// Examine each user of the node.
3371
for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3372
UI != UE; ++UI) {
3373
// Only check things that use the flags.
3374
if (UI.getUse().getResNo() != Flags.getResNo())
3375
continue;
3376
3377
unsigned UIOpc = UI->getOpcode();
3378
3379
if (UIOpc == ISD::CopyToReg) {
3380
// Only examine CopyToReg uses that copy to EFLAGS.
3381
if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3382
return false;
3383
// Examine each user of the CopyToReg use.
3384
for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3385
FlagUI != FlagUE; ++FlagUI) {
3386
// Only examine the Flag result.
3387
if (FlagUI.getUse().getResNo() != 1)
3388
continue;
3389
// Anything unusual: assume conservatively.
3390
if (!FlagUI->isMachineOpcode())
3391
return false;
3392
// Examine the condition code of the user.
3393
X86::CondCode CC = getCondFromNode(*FlagUI);
3394
3395
if (mayUseCarryFlag(CC))
3396
return false;
3397
}
3398
3399
// This CopyToReg is ok. Move on to the next user.
3400
continue;
3401
}
3402
3403
// This might be an unselected node. So look for the pre-isel opcodes that
3404
// use flags.
3405
unsigned CCOpNo;
3406
switch (UIOpc) {
3407
default:
3408
// Something unusual. Be conservative.
3409
return false;
3410
case X86ISD::SETCC: CCOpNo = 0; break;
3411
case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3412
case X86ISD::CMOV: CCOpNo = 2; break;
3413
case X86ISD::BRCOND: CCOpNo = 2; break;
3414
}
3415
3416
X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3417
if (mayUseCarryFlag(CC))
3418
return false;
3419
}
3420
return true;
3421
}
3422
3423
/// Check whether or not the chain ending in StoreNode is suitable for doing
3424
/// the {load; op; store} to modify transformation.
3425
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3426
SDValue StoredVal, SelectionDAG *CurDAG,
3427
unsigned LoadOpNo,
3428
LoadSDNode *&LoadNode,
3429
SDValue &InputChain) {
3430
// Is the stored value result 0 of the operation?
3431
if (StoredVal.getResNo() != 0) return false;
3432
3433
// Are there other uses of the operation other than the store?
3434
if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3435
3436
// Is the store non-extending and non-indexed?
3437
if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3438
return false;
3439
3440
SDValue Load = StoredVal->getOperand(LoadOpNo);
3441
// Is the stored value a non-extending and non-indexed load?
3442
if (!ISD::isNormalLoad(Load.getNode())) return false;
3443
3444
// Return LoadNode by reference.
3445
LoadNode = cast<LoadSDNode>(Load);
3446
3447
// Is store the only read of the loaded value?
3448
if (!Load.hasOneUse())
3449
return false;
3450
3451
// Is the address of the store the same as the load?
3452
if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3453
LoadNode->getOffset() != StoreNode->getOffset())
3454
return false;
3455
3456
bool FoundLoad = false;
3457
SmallVector<SDValue, 4> ChainOps;
3458
SmallVector<const SDNode *, 4> LoopWorklist;
3459
SmallPtrSet<const SDNode *, 16> Visited;
3460
const unsigned int Max = 1024;
3461
3462
// Visualization of Load-Op-Store fusion:
3463
// -------------------------
3464
// Legend:
3465
// *-lines = Chain operand dependencies.
3466
// |-lines = Normal operand dependencies.
3467
// Dependencies flow down and right. n-suffix references multiple nodes.
3468
//
3469
// C Xn C
3470
// * * *
3471
// * * *
3472
// Xn A-LD Yn TF Yn
3473
// * * \ | * |
3474
// * * \ | * |
3475
// * * \ | => A--LD_OP_ST
3476
// * * \| \
3477
// TF OP \
3478
// * | \ Zn
3479
// * | \
3480
// A-ST Zn
3481
//
3482
3483
// This merge induced dependences from: #1: Xn -> LD, OP, Zn
3484
// #2: Yn -> LD
3485
// #3: ST -> Zn
3486
3487
// Ensure the transform is safe by checking for the dual
3488
// dependencies to make sure we do not induce a loop.
3489
3490
// As LD is a predecessor to both OP and ST we can do this by checking:
3491
// a). if LD is a predecessor to a member of Xn or Yn.
3492
// b). if a Zn is a predecessor to ST.
3493
3494
// However, (b) can only occur through being a chain predecessor to
3495
// ST, which is the same as Zn being a member or predecessor of Xn,
3496
// which is a subset of LD being a predecessor of Xn. So it's
3497
// subsumed by check (a).
3498
3499
SDValue Chain = StoreNode->getChain();
3500
3501
// Gather X elements in ChainOps.
3502
if (Chain == Load.getValue(1)) {
3503
FoundLoad = true;
3504
ChainOps.push_back(Load.getOperand(0));
3505
} else if (Chain.getOpcode() == ISD::TokenFactor) {
3506
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3507
SDValue Op = Chain.getOperand(i);
3508
if (Op == Load.getValue(1)) {
3509
FoundLoad = true;
3510
// Drop Load, but keep its chain. No cycle check necessary.
3511
ChainOps.push_back(Load.getOperand(0));
3512
continue;
3513
}
3514
LoopWorklist.push_back(Op.getNode());
3515
ChainOps.push_back(Op);
3516
}
3517
}
3518
3519
if (!FoundLoad)
3520
return false;
3521
3522
// Worklist is currently Xn. Add Yn to worklist.
3523
for (SDValue Op : StoredVal->ops())
3524
if (Op.getNode() != LoadNode)
3525
LoopWorklist.push_back(Op.getNode());
3526
3527
// Check (a) if Load is a predecessor to Xn + Yn
3528
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3529
true))
3530
return false;
3531
3532
InputChain =
3533
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3534
return true;
3535
}
3536
3537
// Change a chain of {load; op; store} of the same value into a simple op
3538
// through memory of that value, if the uses of the modified value and its
3539
// address are suitable.
3540
//
3541
// The tablegen memory operand pattern is currently not able to match
3542
// the case where the EFLAGS on the original operation are used.
3543
//
3544
// To move this to tablegen, we'll need to improve tablegen to allow flags to
3545
// be transferred from a node in the pattern to the result node, probably with
3546
// a new keyword. For example, we have this
3547
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3548
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3549
// (implicit EFLAGS)]>;
3550
// but maybe need something like this
3551
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3552
// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3553
// (transferrable EFLAGS)]>;
3554
//
3555
// Until then, we manually fold these and instruction select the operation
3556
// here.
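// Roughly, a DAG of the shape (store (add (load p), 5), p), where the loaded
// value has no other users, is selected here to a single addq $5, (p) (or the
// i32/i16/i8 form), with inc/dec used for +/-1 when the carry flag is dead.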
3557
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3558
auto *StoreNode = cast<StoreSDNode>(Node);
3559
SDValue StoredVal = StoreNode->getOperand(1);
3560
unsigned Opc = StoredVal->getOpcode();
3561
3562
// Before we try to select anything, make sure this is memory operand size
3563
// and opcode we can handle. Note that this must match the code below that
3564
// actually lowers the opcodes.
3565
EVT MemVT = StoreNode->getMemoryVT();
3566
if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3567
MemVT != MVT::i8)
3568
return false;
3569
3570
bool IsCommutable = false;
3571
bool IsNegate = false;
3572
switch (Opc) {
3573
default:
3574
return false;
3575
case X86ISD::SUB:
3576
IsNegate = isNullConstant(StoredVal.getOperand(0));
3577
break;
3578
case X86ISD::SBB:
3579
break;
3580
case X86ISD::ADD:
3581
case X86ISD::ADC:
3582
case X86ISD::AND:
3583
case X86ISD::OR:
3584
case X86ISD::XOR:
3585
IsCommutable = true;
3586
break;
3587
}
3588
3589
unsigned LoadOpNo = IsNegate ? 1 : 0;
3590
LoadSDNode *LoadNode = nullptr;
3591
SDValue InputChain;
3592
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3593
LoadNode, InputChain)) {
3594
if (!IsCommutable)
3595
return false;
3596
3597
// This operation is commutable, try the other operand.
3598
LoadOpNo = 1;
3599
if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3600
LoadNode, InputChain))
3601
return false;
3602
}
3603
3604
SDValue Base, Scale, Index, Disp, Segment;
3605
if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3606
Segment))
3607
return false;
3608
3609
auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3610
unsigned Opc8) {
3611
switch (MemVT.getSimpleVT().SimpleTy) {
3612
case MVT::i64:
3613
return Opc64;
3614
case MVT::i32:
3615
return Opc32;
3616
case MVT::i16:
3617
return Opc16;
3618
case MVT::i8:
3619
return Opc8;
3620
default:
3621
llvm_unreachable("Invalid size!");
3622
}
3623
};
3624
3625
MachineSDNode *Result;
3626
switch (Opc) {
3627
case X86ISD::SUB:
3628
// Handle negate.
3629
if (IsNegate) {
3630
unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3631
X86::NEG8m);
3632
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3633
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3634
MVT::Other, Ops);
3635
break;
3636
}
3637
[[fallthrough]];
3638
case X86ISD::ADD:
3639
// Try to match inc/dec.
3640
if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3641
bool IsOne = isOneConstant(StoredVal.getOperand(1));
3642
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3643
// ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
3644
if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3645
unsigned NewOpc =
3646
((Opc == X86ISD::ADD) == IsOne)
3647
? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3648
: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3649
const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3650
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3651
MVT::Other, Ops);
3652
break;
3653
}
3654
}
3655
[[fallthrough]];
3656
case X86ISD::ADC:
3657
case X86ISD::SBB:
3658
case X86ISD::AND:
3659
case X86ISD::OR:
3660
case X86ISD::XOR: {
3661
auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3662
switch (Opc) {
3663
case X86ISD::ADD:
3664
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3665
X86::ADD8mr);
3666
case X86ISD::ADC:
3667
return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3668
X86::ADC8mr);
3669
case X86ISD::SUB:
3670
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3671
X86::SUB8mr);
3672
case X86ISD::SBB:
3673
return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3674
X86::SBB8mr);
3675
case X86ISD::AND:
3676
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3677
X86::AND8mr);
3678
case X86ISD::OR:
3679
return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3680
case X86ISD::XOR:
3681
return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3682
X86::XOR8mr);
3683
default:
3684
llvm_unreachable("Invalid opcode!");
3685
}
3686
};
3687
auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3688
switch (Opc) {
3689
case X86ISD::ADD:
3690
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3691
X86::ADD8mi);
3692
case X86ISD::ADC:
3693
return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3694
X86::ADC8mi);
3695
case X86ISD::SUB:
3696
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3697
X86::SUB8mi);
3698
case X86ISD::SBB:
3699
return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3700
X86::SBB8mi);
3701
case X86ISD::AND:
3702
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3703
X86::AND8mi);
3704
case X86ISD::OR:
3705
return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3706
X86::OR8mi);
3707
case X86ISD::XOR:
3708
return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3709
X86::XOR8mi);
3710
default:
3711
llvm_unreachable("Invalid opcode!");
3712
}
3713
};
3714
3715
unsigned NewOpc = SelectRegOpcode(Opc);
3716
SDValue Operand = StoredVal->getOperand(1 - LoadOpNo);
3717
3718
// See if the operand is a constant that we can fold into an immediate
3719
// operand.
3720
if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3721
int64_t OperandV = OperandC->getSExtValue();
3722
3723
// Check if we can shrink the operand enough to fit in an immediate (or
3724
// fit into a smaller immediate) by negating it and switching the
3725
// operation.
3726
if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3727
((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3728
(MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3729
isInt<32>(-OperandV))) &&
3730
hasNoCarryFlagUses(StoredVal.getValue(1))) {
3731
OperandV = -OperandV;
3732
Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3733
}
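// E.g. adding 128 needs a 4-byte immediate, but subtracting -128 fits the
// sign-extended imm8 form; an i64 add of 0x80000000 cannot be encoded as
// imm32 at all, while the equivalent sub of -0x80000000 can.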
3734
3735
if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3736
Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3737
NewOpc = SelectImmOpcode(Opc);
3738
}
3739
}
3740
3741
if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3742
SDValue CopyTo =
3743
CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3744
StoredVal.getOperand(2), SDValue());
3745
3746
const SDValue Ops[] = {Base, Scale, Index, Disp,
3747
Segment, Operand, CopyTo, CopyTo.getValue(1)};
3748
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3749
Ops);
3750
} else {
3751
const SDValue Ops[] = {Base, Scale, Index, Disp,
3752
Segment, Operand, InputChain};
3753
Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3754
Ops);
3755
}
3756
break;
3757
}
3758
default:
3759
llvm_unreachable("Invalid opcode!");
3760
}
3761
3762
MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3763
LoadNode->getMemOperand()};
3764
CurDAG->setNodeMemRefs(Result, MemOps);
3765
3766
// Update Load Chain uses as well.
3767
ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3768
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3769
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3770
CurDAG->RemoveDeadNode(Node);
3771
return true;
3772
}
3773
3774
// See if this is an X & Mask that we can match to BEXTR/BZHI.
3775
// Where Mask is one of the following patterns:
3776
// a) x & (1 << nbits) - 1
3777
// b) x & ~(-1 << nbits)
3778
// c) x & (-1 >> (32 - y))
3779
// d) x << (32 - y) >> (32 - y)
3780
// e) (1 << nbits) - 1
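// For example, with nbits in %ecx, patterns a) and b) can be selected to
//   bzhil %ecx, %esi, %eax
// on BMI2 targets, or to a BEXTR whose control word carries the bit count in
// bits 15:8 when only BMI1's BEXTR is available (illustrative).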
3781
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3782
assert(
3783
(Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3784
Node->getOpcode() == ISD::SRL) &&
3785
"Should be either an and-mask, or right-shift after clearing high bits.");
3786
3787
// BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3788
if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3789
return false;
3790
3791
MVT NVT = Node->getSimpleValueType(0);
3792
3793
// Only supported for 32 and 64 bits.
3794
if (NVT != MVT::i32 && NVT != MVT::i64)
3795
return false;
3796
3797
SDValue NBits;
3798
bool NegateNBits;
3799
3800
// If we have BMI2's BZHI, we are ok with multi-use patterns.
3801
// Else, if we only have BMI1's BEXTR, we require one-use.
3802
const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3803
auto checkUses = [AllowExtraUsesByDefault](
3804
SDValue Op, unsigned NUses,
3805
std::optional<bool> AllowExtraUses) {
3806
return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3807
Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3808
};
3809
auto checkOneUse = [checkUses](SDValue Op,
3810
std::optional<bool> AllowExtraUses =
3811
std::nullopt) {
3812
return checkUses(Op, 1, AllowExtraUses);
3813
};
3814
auto checkTwoUse = [checkUses](SDValue Op,
3815
std::optional<bool> AllowExtraUses =
3816
std::nullopt) {
3817
return checkUses(Op, 2, AllowExtraUses);
3818
};
3819
3820
auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3821
if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3822
assert(V.getSimpleValueType() == MVT::i32 &&
3823
V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3824
"Expected i64 -> i32 truncation");
3825
V = V.getOperand(0);
3826
}
3827
return V;
3828
};
3829
3830
// a) x & ((1 << nbits) + (-1))
3831
auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3832
&NegateNBits](SDValue Mask) -> bool {
3833
// Match `add`. Must only have one use!
3834
if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3835
return false;
3836
// We should be adding all-ones constant (i.e. subtracting one.)
3837
if (!isAllOnesConstant(Mask->getOperand(1)))
3838
return false;
3839
// Match `1 << nbits`. Might be truncated. Must only have one use!
3840
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3841
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3842
return false;
3843
if (!isOneConstant(M0->getOperand(0)))
3844
return false;
3845
NBits = M0->getOperand(1);
3846
NegateNBits = false;
3847
return true;
3848
};
3849
3850
auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3851
V = peekThroughOneUseTruncation(V);
3852
return CurDAG->MaskedValueIsAllOnes(
3853
V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3854
NVT.getSizeInBits()));
3855
};
3856
3857
// b) x & ~(-1 << nbits)
3858
auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3859
&NBits, &NegateNBits](SDValue Mask) -> bool {
3860
// Match `~()`. Must only have one use!
3861
if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3862
return false;
3863
// The -1 only has to be all-ones for the final Node's NVT.
3864
if (!isAllOnes(Mask->getOperand(1)))
3865
return false;
3866
// Match `-1 << nbits`. Might be truncated. Must only have one use!
3867
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3868
if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3869
return false;
3870
// The -1 only has to be all-ones for the final Node's NVT.
3871
if (!isAllOnes(M0->getOperand(0)))
3872
return false;
3873
NBits = M0->getOperand(1);
3874
NegateNBits = false;
3875
return true;
3876
};
3877
3878
// Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3879
// or leave the shift amount as-is, but then we'll have to negate it.
3880
auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3881
unsigned Bitwidth) {
3882
NBits = ShiftAmt;
3883
NegateNBits = true;
3884
// Skip over a truncate of the shift amount, if any.
3885
if (NBits.getOpcode() == ISD::TRUNCATE)
3886
NBits = NBits.getOperand(0);
3887
// Try to match the shift amount as (bitwidth - y). It should go away, too.
3888
// If it doesn't match, that's fine, we'll just negate it ourselves.
3889
if (NBits.getOpcode() != ISD::SUB)
3890
return;
3891
auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3892
if (!V0 || V0->getZExtValue() != Bitwidth)
3893
return;
3894
NBits = NBits.getOperand(1);
3895
NegateNBits = false;
3896
};
3897
3898
// c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3899
// or
3900
// c) x & (-1 >> (32 - y))
3901
auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3902
canonicalizeShiftAmt](SDValue Mask) -> bool {
3903
// The mask itself may be truncated.
3904
Mask = peekThroughOneUseTruncation(Mask);
3905
unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3906
// Match `l>>`. Must only have one use!
3907
if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3908
return false;
3909
// We should be shifting truly all-ones constant.
3910
if (!isAllOnesConstant(Mask.getOperand(0)))
3911
return false;
3912
SDValue M1 = Mask.getOperand(1);
3913
// The shift amount should not be used externally.
3914
if (!checkOneUse(M1))
3915
return false;
3916
canonicalizeShiftAmt(M1, Bitwidth);
3917
// Pattern c. is non-canonical, and is expanded into pattern d. iff there
3918
// is no extra use of the mask. Clearly, there was one since we are here.
3919
// But at the same time, if we need to negate the shift amount,
3920
// then we don't want the mask to stick around, else it's unprofitable.
3921
return !NegateNBits;
3922
};
3923
3924
SDValue X;
3925
3926
// d) x << z >> z but then we'll have to subtract z from bitwidth
3927
// or
3928
// d) x << (32 - y) >> (32 - y)
3929
auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3930
AllowExtraUsesByDefault, &NegateNBits,
3931
&X](SDNode *Node) -> bool {
3932
if (Node->getOpcode() != ISD::SRL)
3933
return false;
3934
SDValue N0 = Node->getOperand(0);
3935
if (N0->getOpcode() != ISD::SHL)
3936
return false;
3937
unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3938
SDValue N1 = Node->getOperand(1);
3939
SDValue N01 = N0->getOperand(1);
3940
// Both of the shifts must be by the exact same value.
3941
if (N1 != N01)
3942
return false;
3943
canonicalizeShiftAmt(N1, Bitwidth);
3944
// There should not be any external uses of the inner shift / shift amount.
3945
// Note that while we are generally okay with external uses given BMI2,
3946
// iff we need to negate the shift amount, we are not okay with extra uses.
3947
const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3948
if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3949
return false;
3950
X = N0->getOperand(0);
3951
return true;
3952
};
3953
3954
auto matchLowBitMask = [matchPatternA, matchPatternB,
3955
matchPatternC](SDValue Mask) -> bool {
3956
return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3957
};
3958
3959
if (Node->getOpcode() == ISD::AND) {
3960
X = Node->getOperand(0);
3961
SDValue Mask = Node->getOperand(1);
3962
3963
if (matchLowBitMask(Mask)) {
3964
// Great.
3965
} else {
3966
std::swap(X, Mask);
3967
if (!matchLowBitMask(Mask))
3968
return false;
3969
}
3970
} else if (matchLowBitMask(SDValue(Node, 0))) {
3971
X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3972
} else if (!matchPatternD(Node))
3973
return false;
3974
3975
// If we need to negate the shift amount, require BMI2 BZHI support.
3976
// It's just too unprofitable for BMI1 BEXTR.
3977
if (NegateNBits && !Subtarget->hasBMI2())
3978
return false;
3979
3980
SDLoc DL(Node);
3981
3982
// Truncate the shift amount.
3983
NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3984
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3985
3986
// Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3987
// All the other bits are undefined, we do not care about them.
3988
SDValue ImplDef = SDValue(
3989
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3990
insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3991
3992
SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3993
insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3994
NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3995
MVT::i32, ImplDef, NBits, SRIdxVal),
3996
0);
3997
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3998
3999
// We might have matched the amount of high bits to be cleared,
4000
// but we want the amount of low bits to be kept, so negate it then.
4001
if (NegateNBits) {
4002
SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4003
insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4004
4005
NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4006
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4007
}
4008
4009
if (Subtarget->hasBMI2()) {
4010
// Great, just emit the BZHI..
4011
if (NVT != MVT::i32) {
4012
// But have to place the bit count into the wide-enough register first.
4013
NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4014
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4015
}
4016
4017
SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4018
ReplaceNode(Node, Extract.getNode());
4019
SelectCode(Extract.getNode());
4020
return true;
4021
}
4022
4023
// Else, if we do *NOT* have BMI2, let's find out if the 'X' is
4024
// *logically* shifted (potentially with a one-use trunc in between),
4025
// and the truncation was the only use of the shift,
4026
// and if so look past one-use truncation.
4027
{
4028
SDValue RealX = peekThroughOneUseTruncation(X);
4029
// FIXME: only if the shift is one-use?
4030
if (RealX != X && RealX.getOpcode() == ISD::SRL)
4031
X = RealX;
4032
}
4033
4034
MVT XVT = X.getSimpleValueType();
4035
4036
// Else, emitting BEXTR requires one more step.
4037
// The 'control' of BEXTR has the pattern of:
4038
// [15...8 bit][ 7...0 bit] location
4039
// [ bit count][ shift] name
4040
// I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
4041
4042
// Shift NBits left by 8 bits, thus producing 'control'.
4043
// This makes the low 8 bits to be zero.
4044
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4045
insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4046
SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4047
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4048
4049
// If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4050
// FIXME: only if the shift is one-use?
4051
if (X.getOpcode() == ISD::SRL) {
4052
SDValue ShiftAmt = X.getOperand(1);
4053
X = X.getOperand(0);
4054
4055
assert(ShiftAmt.getValueType() == MVT::i8 &&
4056
"Expected shift amount to be i8");
4057
4058
// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4059
// We could zext to i16 in some form, but we intentionally don't do that.
4060
SDValue OrigShiftAmt = ShiftAmt;
4061
ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4062
insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4063
4064
// And now 'or' these low 8 bits of shift amount into the 'control'.
4065
Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4066
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4067
}
4068
4069
// But have to place the 'control' into the wide-enough register first.
4070
if (XVT != MVT::i32) {
4071
Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4072
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4073
}
4074
4075
// And finally, form the BEXTR itself.
4076
SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4077
4078
// The 'X' was originally truncated. Do that now.
4079
if (XVT != NVT) {
4080
insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4081
Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4082
}
4083
4084
ReplaceNode(Node, Extract.getNode());
4085
SelectCode(Extract.getNode());
4086
4087
return true;
4088
}
4089
4090
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4091
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4092
MVT NVT = Node->getSimpleValueType(0);
4093
SDLoc dl(Node);
4094
4095
SDValue N0 = Node->getOperand(0);
4096
SDValue N1 = Node->getOperand(1);
4097
4098
// If we have TBM we can use an immediate for the control. If we have BMI
4099
// we should only do this if the BEXTR instruction is implemented well.
4100
// Otherwise moving the control into a register makes this more costly.
4101
// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4102
// hoisting the move immediate would make it worthwhile with a less optimal
4103
// BEXTR?
4104
bool PreferBEXTR =
4105
Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4106
if (!PreferBEXTR && !Subtarget->hasBMI2())
4107
return nullptr;
4108
4109
// Must have a shift right.
4110
if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4111
return nullptr;
4112
4113
// Shift can't have additional users.
4114
if (!N0->hasOneUse())
4115
return nullptr;
4116
4117
// Only supported for 32 and 64 bits.
4118
if (NVT != MVT::i32 && NVT != MVT::i64)
4119
return nullptr;
4120
4121
// Shift amount and RHS of and must be constant.
4122
auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4123
auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4124
if (!MaskCst || !ShiftCst)
4125
return nullptr;
4126
4127
// And RHS must be a mask.
4128
uint64_t Mask = MaskCst->getZExtValue();
4129
if (!isMask_64(Mask))
4130
return nullptr;
4131
4132
uint64_t Shift = ShiftCst->getZExtValue();
4133
uint64_t MaskSize = llvm::popcount(Mask);
4134
4135
// Don't interfere with something that can be handled by extracting AH.
4136
// TODO: If we are able to fold a load, BEXTR might still be better than AH.
4137
if (Shift == 8 && MaskSize == 8)
4138
return nullptr;
4139
4140
// Make sure we are only using bits that were in the original value, not
4141
// shifted in.
4142
if (Shift + MaskSize > NVT.getSizeInBits())
4143
return nullptr;
4144
4145
// BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4146
// that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4147
// does not fit into 32 bits. Load folding is not a sufficient reason.
4148
if (!PreferBEXTR && MaskSize <= 32)
4149
return nullptr;
4150
4151
SDValue Control;
4152
unsigned ROpc, MOpc;
4153
4154
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4155
if (!PreferBEXTR) {
4156
assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4157
// If we can't make use of BEXTR then we can't fuse shift+mask stages.
4158
// Let's perform the mask first, and apply shift later. Note that we need to
4159
// widen the mask to account for the fact that we'll apply shift afterwards!
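// E.g. for (x >> 16) & ((1 << 40) - 1), the mask step uses a bit count of
// 16 + 40 = 56 and the shift emitted below then drops the low 16 bits,
// leaving the same 40 bits.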
4160
Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4161
ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4162
: GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4163
MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4164
: GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4165
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4166
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4167
} else {
4168
// The 'control' of BEXTR has the pattern of:
4169
// [15...8 bit][ 7...0 bit] location
4170
// [ bit count][ shift] name
4171
// I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
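// For example, Shift = 4 and MaskSize = 8 give Control = 0x0804, i.e. extract
// bits [11:4] of the input.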
4172
Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4173
if (Subtarget->hasTBM()) {
4174
ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4175
MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4176
} else {
4177
assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4178
// BMI requires the immediate to be placed in a register.
4179
ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4180
: GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4181
MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4182
: GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4183
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4184
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4185
}
4186
}
4187
4188
MachineSDNode *NewNode;
4189
SDValue Input = N0->getOperand(0);
4190
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4191
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4192
SDValue Ops[] = {
4193
Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4194
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4195
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4196
// Update the chain.
4197
ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4198
// Record the mem-refs
4199
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4200
} else {
4201
NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4202
}
4203
4204
if (!PreferBEXTR) {
4205
// We still need to apply the shift.
4206
SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4207
unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4208
: GET_ND_IF_ENABLED(X86::SHR32ri);
4209
NewNode =
4210
CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4211
}
4212
4213
return NewNode;
4214
}
4215
4216
// Emit a PCMPISTR(I/M) instruction.
4217
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4218
bool MayFoldLoad, const SDLoc &dl,
4219
MVT VT, SDNode *Node) {
4220
SDValue N0 = Node->getOperand(0);
4221
SDValue N1 = Node->getOperand(1);
4222
SDValue Imm = Node->getOperand(2);
4223
auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4224
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4225
4226
// Try to fold a load. No need to check alignment.
4227
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4228
if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4229
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4230
N1.getOperand(0) };
4231
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4232
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4233
// Update the chain.
4234
ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4235
// Record the mem-refs
4236
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4237
return CNode;
4238
}
4239
4240
SDValue Ops[] = { N0, N1, Imm };
4241
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4242
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4243
return CNode;
4244
}
4245
4246
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4247
// to emit a second instruction after this one. This is needed since we have two
4248
// copyToReg nodes glued before this and we need to continue that glue through.
4249
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4250
bool MayFoldLoad, const SDLoc &dl,
4251
MVT VT, SDNode *Node,
4252
SDValue &InGlue) {
4253
SDValue N0 = Node->getOperand(0);
4254
SDValue N2 = Node->getOperand(2);
4255
SDValue Imm = Node->getOperand(4);
4256
auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4257
Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4258
4259
// Try to fold a load. No need to check alignment.
4260
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4261
if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4262
SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4263
N2.getOperand(0), InGlue };
4264
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4265
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4266
InGlue = SDValue(CNode, 3);
4267
// Update the chain.
4268
ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4269
// Record the mem-refs
4270
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4271
return CNode;
4272
}
4273
4274
SDValue Ops[] = { N0, N2, Imm, InGlue };
4275
SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4276
MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4277
InGlue = SDValue(CNode, 2);
4278
return CNode;
4279
}
4280
4281
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4282
EVT VT = N->getValueType(0);
4283
4284
// Only handle scalar shifts.
4285
if (VT.isVector())
4286
return false;
4287
4288
// Narrower shifts only mask to 5 bits in hardware.
4289
unsigned Size = VT == MVT::i64 ? 64 : 32;
4290
4291
SDValue OrigShiftAmt = N->getOperand(1);
4292
SDValue ShiftAmt = OrigShiftAmt;
4293
SDLoc DL(N);
4294
4295
// Skip over a truncate of the shift amount.
4296
if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4297
ShiftAmt = ShiftAmt->getOperand(0);
4298
4299
// This function is called after X86DAGToDAGISel::matchBitExtract(),
4300
// so we are not afraid that we might mess up BZHI/BEXTR pattern.
4301
4302
SDValue NewShiftAmt;
4303
if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4304
ShiftAmt->getOpcode() == ISD::XOR) {
4305
SDValue Add0 = ShiftAmt->getOperand(0);
4306
SDValue Add1 = ShiftAmt->getOperand(1);
4307
auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4308
auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4309
// If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4310
// to avoid the ADD/SUB/XOR.
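// E.g. a 64-bit shift by (%amt + 64) is the same as a shift by %amt, because
// the hardware only looks at the low 6 bits of the amount.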
4311
if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4312
NewShiftAmt = Add0;
4313
4314
} else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4315
((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4316
(Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4317
// If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4318
// we can replace it with a NOT. In the XOR case it may save some code
4319
// size, in the SUB case it also may save a move.
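// E.g. for a 32-bit shift, (31 - %amt) and (31 ^ %amt) both agree with ~%amt
// in the low 5 bits, which is all the shift reads.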
4320
assert(Add0C == nullptr || Add1C == nullptr);
4321
4322
// We can only do N-X, not X-N
4323
if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4324
return false;
4325
4326
EVT OpVT = ShiftAmt.getValueType();
4327
4328
SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4329
NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4330
Add0C == nullptr ? Add0 : Add1, AllOnes);
4331
insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4332
insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4333
// If we are shifting by N-X where N == 0 mod Size, then just shift by
4334
// -X to generate a NEG instead of a SUB of a constant.
4335
} else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4336
Add0C->getZExtValue() != 0) {
4337
EVT SubVT = ShiftAmt.getValueType();
4338
SDValue X;
4339
if (Add0C->getZExtValue() % Size == 0)
4340
X = Add1;
4341
else if (ShiftAmt.hasOneUse() && Size == 64 &&
4342
Add0C->getZExtValue() % 32 == 0) {
4343
// We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4344
// This is mainly beneficial if we already compute (x+n*32).
4345
if (Add1.getOpcode() == ISD::TRUNCATE) {
4346
Add1 = Add1.getOperand(0);
4347
SubVT = Add1.getValueType();
4348
}
4349
if (Add0.getValueType() != SubVT) {
4350
Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4351
insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4352
}
4353
4354
X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4355
insertDAGNode(*CurDAG, OrigShiftAmt, X);
4356
} else
4357
return false;
4358
// Insert a negate op.
4359
// TODO: This isn't guaranteed to replace the sub if there is a logic cone
4360
// that uses it that's not a shift.
4361
SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4362
SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4363
NewShiftAmt = Neg;
4364
4365
// Insert these operands into a valid topological order so they can
4366
// get selected independently.
4367
insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4368
insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4369
} else
4370
return false;
4371
} else
4372
return false;
4373
4374
if (NewShiftAmt.getValueType() != MVT::i8) {
4375
// Need to truncate the shift amount.
4376
NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4377
// Add to a correct topological ordering.
4378
insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4379
}
4380
4381
// Insert a new mask to keep the shift amount legal. This should be removed
4382
// by isel patterns.
4383
NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4384
CurDAG->getConstant(Size - 1, DL, MVT::i8));
4385
// Place in a correct topological ordering.
4386
insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4387
4388
SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4389
NewShiftAmt);
4390
if (UpdatedNode != N) {
4391
// If we found an existing node, we should replace ourselves with that node
4392
// and wait for it to be selected after its other users.
4393
ReplaceNode(N, UpdatedNode);
4394
return true;
4395
}
4396
4397
// If the original shift amount is now dead, delete it so that we don't run
4398
// it through isel.
4399
if (OrigShiftAmt.getNode()->use_empty())
4400
CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4401
4402
// Now that we've optimized the shift amount, defer to normal isel to get
4403
// load folding and legacy vs BMI2 selection without repeating it here.
4404
SelectCode(N);
4405
return true;
4406
}
4407
4408
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4409
MVT NVT = N->getSimpleValueType(0);
4410
unsigned Opcode = N->getOpcode();
4411
SDLoc dl(N);
4412
4413
// For operations of the form (x << C1) op C2, check if we can use a smaller
4414
// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
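// E.g. (or (shl X, 8), 0x1122334400) needs a 64-bit immediate, but the
// reassociated (shl (or X, 0x11223344), 8) only needs a 32-bit one; likewise
// (and (shl X, 8), 0xFF00) can become (shl (and X, 0xFF), 8), where the AND
// is then selectable as a MOVZX.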
4415
SDValue Shift = N->getOperand(0);
4416
SDValue N1 = N->getOperand(1);
4417
4418
auto *Cst = dyn_cast<ConstantSDNode>(N1);
4419
if (!Cst)
4420
return false;
4421
4422
int64_t Val = Cst->getSExtValue();
4423
4424
// If we have an any_extend feeding the AND, look through it to see if there
4425
// is a shift behind it. But only if the AND doesn't use the extended bits.
4426
// FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4427
bool FoundAnyExtend = false;
4428
if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4429
Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4430
isUInt<32>(Val)) {
4431
FoundAnyExtend = true;
4432
Shift = Shift.getOperand(0);
4433
}
4434
4435
if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4436
return false;
4437
4438
// i8 is unshrinkable, i16 should be promoted to i32.
4439
if (NVT != MVT::i32 && NVT != MVT::i64)
4440
return false;
4441
4442
auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4443
if (!ShlCst)
4444
return false;
4445
4446
uint64_t ShAmt = ShlCst->getZExtValue();
4447
4448
// Make sure that we don't change the operation by removing bits.
4449
// This only matters for OR and XOR, AND is unaffected.
4450
uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4451
if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4452
return false;
4453
4454
// Check the minimum bitwidth for the new constant.
4455
// TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4456
auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4457
if (Opcode == ISD::AND) {
4458
// AND32ri is the same as AND64ri32 with zext imm.
4459
// Try this before sign extended immediates below.
4460
ShiftedVal = (uint64_t)Val >> ShAmt;
4461
if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4462
return true;
4463
// Also swap order when the AND can become MOVZX.
4464
if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4465
return true;
4466
}
4467
ShiftedVal = Val >> ShAmt;
4468
if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4469
(!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4470
return true;
4471
if (Opcode != ISD::AND) {
4472
// MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4473
ShiftedVal = (uint64_t)Val >> ShAmt;
4474
if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4475
return true;
4476
}
4477
return false;
4478
};
4479
4480
int64_t ShiftedVal;
4481
if (!CanShrinkImmediate(ShiftedVal))
4482
return false;
4483
4484
// Ok, we can reorder to get a smaller immediate.
4485
4486
// But it's possible the original immediate allowed an AND to become MOVZX.
4487
// We do this late to delay the MaskedValueIsZero call as long as
4488
// possible.
4489
if (Opcode == ISD::AND) {
4490
// Find the smallest zext this could possibly be.
4491
unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4492
ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4493
4494
// Figure out which bits need to be zero to achieve that mask.
4495
APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4496
ZExtWidth);
4497
NeededMask &= ~Cst->getAPIntValue();
4498
4499
if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4500
return false;
4501
}
4502
4503
SDValue X = Shift.getOperand(0);
4504
if (FoundAnyExtend) {
4505
SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4506
insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4507
X = NewX;
4508
}
4509
4510
SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4511
insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4512
SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4513
insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4514
SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4515
Shift.getOperand(1));
4516
ReplaceNode(N, NewSHL.getNode());
4517
SelectCode(NewSHL.getNode());
4518
return true;
4519
}
4520
4521
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4522
SDNode *ParentB, SDNode *ParentC,
4523
SDValue A, SDValue B, SDValue C,
4524
uint8_t Imm) {
4525
assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4526
C.isOperandOf(ParentC) && "Incorrect parent node");
4527
4528
auto tryFoldLoadOrBCast =
4529
[this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4530
SDValue &Index, SDValue &Disp, SDValue &Segment) {
4531
if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4532
return true;
4533
4534
// Not a load, check for broadcast which may be behind a bitcast.
4535
if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4536
P = L.getNode();
4537
L = L.getOperand(0);
4538
}
4539
4540
if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4541
return false;
4542
4543
// Only 32 and 64 bit broadcasts are supported.
4544
auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4545
unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4546
if (Size != 32 && Size != 64)
4547
return false;
4548
4549
return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4550
};
4551
4552
bool FoldedLoad = false;
4553
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4554
if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4555
FoldedLoad = true;
4556
} else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4557
Tmp4)) {
4558
FoldedLoad = true;
4559
std::swap(A, C);
4560
// Swap bits 1/4 and 3/6.
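// (Bit k of the ternlog immediate is the output for (A,B,C) = (k[2],k[1],k[0]),
// so exchanging A and C swaps index bits 2 and 0, i.e. k=1<->4 and k=3<->6,
// while k=0,2,5,7 (mask 0xa5) stay put. The B/C swap below likewise exchanges
// index bits 1 and 0, keeping mask 0x99.)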
4561
uint8_t OldImm = Imm;
4562
Imm = OldImm & 0xa5;
4563
if (OldImm & 0x02) Imm |= 0x10;
4564
if (OldImm & 0x10) Imm |= 0x02;
4565
if (OldImm & 0x08) Imm |= 0x40;
4566
if (OldImm & 0x40) Imm |= 0x08;
4567
} else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4568
Tmp4)) {
4569
FoldedLoad = true;
4570
std::swap(B, C);
4571
// Swap bits 1/2 and 5/6.
4572
uint8_t OldImm = Imm;
4573
Imm = OldImm & 0x99;
4574
if (OldImm & 0x02) Imm |= 0x04;
4575
if (OldImm & 0x04) Imm |= 0x02;
4576
if (OldImm & 0x20) Imm |= 0x40;
4577
if (OldImm & 0x40) Imm |= 0x20;
4578
}
4579
4580
SDLoc DL(Root);
4581
4582
SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4583
4584
MVT NVT = Root->getSimpleValueType(0);
4585
4586
MachineSDNode *MNode;
4587
if (FoldedLoad) {
4588
SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4589
4590
unsigned Opc;
4591
if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4592
auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4593
unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4594
assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4595
4596
bool UseD = EltSize == 32;
4597
if (NVT.is128BitVector())
4598
Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4599
else if (NVT.is256BitVector())
4600
Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4601
else if (NVT.is512BitVector())
4602
Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4603
else
4604
llvm_unreachable("Unexpected vector size!");
4605
} else {
4606
bool UseD = NVT.getVectorElementType() == MVT::i32;
4607
if (NVT.is128BitVector())
4608
Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4609
else if (NVT.is256BitVector())
4610
Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4611
else if (NVT.is512BitVector())
4612
Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4613
else
4614
llvm_unreachable("Unexpected vector size!");
4615
}
4616
4617
SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4618
MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4619
4620
// Update the chain.
4621
ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4622
// Record the mem-refs
4623
CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4624
} else {
4625
bool UseD = NVT.getVectorElementType() == MVT::i32;
4626
unsigned Opc;
4627
if (NVT.is128BitVector())
4628
Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4629
else if (NVT.is256BitVector())
4630
Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4631
else if (NVT.is512BitVector())
4632
Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4633
else
4634
llvm_unreachable("Unexpected vector size!");
4635
4636
MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4637
}
4638
4639
ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4640
CurDAG->RemoveDeadNode(Root);
4641
return true;
4642
}
4643
4644
// Try to match two logic ops to a VPTERNLOG.
4645
// FIXME: Handle more complex patterns that use an operand more than once?
4646
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4647
MVT NVT = N->getSimpleValueType(0);
4648
4649
// Make sure we support VPTERNLOG.
4650
if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4651
NVT.getVectorElementType() == MVT::i1)
4652
return false;
4653
4654
// We need VLX for 128/256-bit.
4655
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4656
return false;
4657
4658
SDValue N0 = N->getOperand(0);
4659
SDValue N1 = N->getOperand(1);
4660
4661
auto getFoldableLogicOp = [](SDValue Op) {
4662
// Peek through single use bitcast.
4663
if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4664
Op = Op.getOperand(0);
4665
4666
if (!Op.hasOneUse())
4667
return SDValue();
4668
4669
unsigned Opc = Op.getOpcode();
4670
if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4671
Opc == X86ISD::ANDNP)
4672
return Op;
4673
4674
return SDValue();
4675
};
4676
4677
SDValue A, FoldableOp;
4678
if ((FoldableOp = getFoldableLogicOp(N1))) {
4679
A = N0;
4680
} else if ((FoldableOp = getFoldableLogicOp(N0))) {
4681
A = N1;
4682
} else
4683
return false;
4684
4685
SDValue B = FoldableOp.getOperand(0);
4686
SDValue C = FoldableOp.getOperand(1);
4687
SDNode *ParentA = N;
4688
SDNode *ParentB = FoldableOp.getNode();
4689
SDNode *ParentC = FoldableOp.getNode();
4690
4691
// We can build the appropriate control immediate by performing the logic
4692
// operation we're matching using these constants for A, B, and C.
4693
uint8_t TernlogMagicA = 0xf0;
4694
uint8_t TernlogMagicB = 0xcc;
4695
uint8_t TernlogMagicC = 0xaa;
4696
4697
// Some of the inputs may be inverted, peek through them and invert the
4698
// magic values accordingly.
4699
// TODO: There may be a bitcast before the xor that we should peek through.
4700
auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4701
if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4702
ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4703
Magic = ~Magic;
4704
Parent = Op.getNode();
4705
Op = Op.getOperand(0);
4706
}
4707
};
4708
4709
PeekThroughNot(A, ParentA, TernlogMagicA);
4710
PeekThroughNot(B, ParentB, TernlogMagicB);
4711
PeekThroughNot(C, ParentC, TernlogMagicC);
4712
4713
uint8_t Imm;
4714
switch (FoldableOp.getOpcode()) {
4715
default: llvm_unreachable("Unexpected opcode!");
4716
case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4717
case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4718
case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4719
case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4720
}
4721
4722
switch (N->getOpcode()) {
4723
default: llvm_unreachable("Unexpected opcode!");
4724
case X86ISD::ANDNP:
4725
if (A == N0)
4726
Imm &= ~TernlogMagicA;
4727
else
4728
Imm = ~(Imm) & TernlogMagicA;
4729
break;
4730
case ISD::AND: Imm &= TernlogMagicA; break;
4731
case ISD::OR: Imm |= TernlogMagicA; break;
4732
case ISD::XOR: Imm ^= TernlogMagicA; break;
4733
}
4734
4735
return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4736
}
4737
4738
/// If the high bits of an 'and' operand are known zero, try setting the
4739
/// high bits of an 'and' constant operand to produce a smaller encoding by
4740
/// creating a small, sign-extended negative immediate rather than a large
4741
/// positive one. This reverses a transform in SimplifyDemandedBits that
4742
/// shrinks mask constants by clearing bits. There is also a possibility that
4743
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4744
/// case, just replace the 'and'. Return 'true' if the node is replaced.
4745
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4746
// i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4747
// have immediate operands.
4748
MVT VT = And->getSimpleValueType(0);
4749
if (VT != MVT::i32 && VT != MVT::i64)
4750
return false;
4751
4752
auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4753
if (!And1C)
4754
return false;
4755
4756
// Bail out if the mask constant is already negative; it can't shrink any
// further.
// If the upper 32 bits of a 64-bit mask are all zeros, we have special isel
// patterns to use a 32-bit and instead of a 64-bit and by relying on the
// implicit zeroing of 32-bit ops. So we should check if the lower 32 bits
// are negative too.
4761
APInt MaskVal = And1C->getAPIntValue();
4762
unsigned MaskLZ = MaskVal.countl_zero();
4763
if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4764
return false;
4765
4766
// Don't extend into the upper 32 bits of a 64 bit mask.
4767
if (VT == MVT::i64 && MaskLZ >= 32) {
4768
MaskLZ -= 32;
4769
MaskVal = MaskVal.trunc(32);
4770
}
4771
4772
SDValue And0 = And->getOperand(0);
4773
APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4774
APInt NegMaskVal = MaskVal | HighZeros;
4775
4776
// If a negative constant would not allow a smaller encoding, there's no need
4777
// to continue. Only change the constant when we know it's a win.
4778
unsigned MinWidth = NegMaskVal.getSignificantBits();
4779
if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4780
return false;
4781
4782
// Extend masks if we truncated above.
4783
if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4784
NegMaskVal = NegMaskVal.zext(64);
4785
HighZeros = HighZeros.zext(64);
4786
}
4787
4788
// The variable operand must be all zeros in the top bits to allow using the
4789
// new, negative constant as the mask.
4790
if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4791
return false;
4792
4793
// Check if the mask is -1. In that case, this is an unnecessary instruction
4794
// that escaped earlier analysis.
4795
if (NegMaskVal.isAllOnes()) {
4796
ReplaceNode(And, And0.getNode());
4797
return true;
4798
}
4799
4800
// A negative mask allows a smaller encoding. Create a new 'and' node.
4801
SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4802
insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4803
SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4804
ReplaceNode(And, NewAnd.getNode());
4805
SelectCode(NewAnd.getNode());
4806
return true;
4807
}
4808
4809
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4810
bool FoldedBCast, bool Masked) {
4811
#define VPTESTM_CASE(VT, SUFFIX) \
4812
case MVT::VT: \
4813
if (Masked) \
4814
return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4815
return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4816
4817
4818
#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4819
default: llvm_unreachable("Unexpected VT!"); \
4820
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4821
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4822
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4823
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4824
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4825
VPTESTM_CASE(v8i64, QZ##SUFFIX)
4826
4827
#define VPTESTM_FULL_CASES(SUFFIX) \
4828
VPTESTM_BROADCAST_CASES(SUFFIX) \
4829
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4830
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4831
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4832
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4833
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4834
VPTESTM_CASE(v32i16, WZ##SUFFIX)
4835
4836
if (FoldedBCast) {
4837
switch (TestVT.SimpleTy) {
4838
VPTESTM_BROADCAST_CASES(rmb)
4839
}
4840
}
4841
4842
if (FoldedLoad) {
4843
switch (TestVT.SimpleTy) {
4844
VPTESTM_FULL_CASES(rm)
4845
}
4846
}
4847
4848
switch (TestVT.SimpleTy) {
4849
VPTESTM_FULL_CASES(rr)
4850
}
4851
4852
#undef VPTESTM_FULL_CASES
4853
#undef VPTESTM_BROADCAST_CASES
4854
#undef VPTESTM_CASE
4855
}
4856
4857
// Try to create VPTESTM instruction. If InMask is not null, it will be used
4858
// to form a masked operation.
4859
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4860
SDValue InMask) {
4861
assert(Subtarget->hasAVX512() && "Expected AVX512!");
4862
assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4863
"Unexpected VT!");
4864
4865
// Look for equal and not equal compares.
4866
ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4867
if (CC != ISD::SETEQ && CC != ISD::SETNE)
4868
return false;
4869
4870
SDValue SetccOp0 = Setcc.getOperand(0);
4871
SDValue SetccOp1 = Setcc.getOperand(1);
4872
4873
// Canonicalize the all zero vector to the RHS.
4874
if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4875
std::swap(SetccOp0, SetccOp1);
4876
4877
// See if we're comparing against zero.
4878
if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4879
return false;
4880
4881
SDValue N0 = SetccOp0;
4882
4883
MVT CmpVT = N0.getSimpleValueType();
4884
MVT CmpSVT = CmpVT.getVectorElementType();
4885
4886
// Start with both operands the same. We'll try to refine this.
4887
SDValue Src0 = N0;
4888
SDValue Src1 = N0;
4889
4890
{
4891
// Look through single use bitcasts.
4892
SDValue N0Temp = N0;
4893
if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4894
N0Temp = N0.getOperand(0);
4895
4896
// Look for single use AND.
4897
if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4898
Src0 = N0Temp.getOperand(0);
4899
Src1 = N0Temp.getOperand(1);
4900
}
4901
}
4902
4903
// Without VLX we need to widen the operation.
4904
bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4905
4906
auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4907
SDValue &Base, SDValue &Scale, SDValue &Index,
4908
SDValue &Disp, SDValue &Segment) {
4909
// If we need to widen, we can't fold the load.
4910
if (!Widen)
4911
if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4912
return true;
4913
4914
// If we didn't fold a load, try to match broadcast. No widening limitation
4915
// for this. But only 32 and 64 bit types are supported.
4916
if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4917
return false;
4918
4919
// Look through single use bitcasts.
4920
if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4921
P = L.getNode();
4922
L = L.getOperand(0);
4923
}
4924
4925
if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4926
return false;
4927
4928
auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4929
if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4930
return false;
4931
4932
return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4933
};
4934
4935
// We can only fold loads if the sources are unique.
4936
bool CanFoldLoads = Src0 != Src1;
4937
4938
bool FoldedLoad = false;
4939
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4940
if (CanFoldLoads) {
4941
FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4942
Tmp3, Tmp4);
4943
if (!FoldedLoad) {
4944
// And is commutative.
4945
FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4946
Tmp2, Tmp3, Tmp4);
4947
if (FoldedLoad)
4948
std::swap(Src0, Src1);
4949
}
4950
}
4951
4952
bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4953
4954
bool IsMasked = InMask.getNode() != nullptr;
4955
4956
SDLoc dl(Root);
4957
4958
MVT ResVT = Setcc.getSimpleValueType();
4959
MVT MaskVT = ResVT;
4960
if (Widen) {
4961
// Widen the inputs using insert_subreg or copy_to_regclass.
4962
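// Only the 512-bit VPTESTM forms exist without VLX, so widen 128-bit inputs
// by 4x and 256-bit inputs by 2x to reach a ZMM-sized vector.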
unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4963
unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4964
unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4965
CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4966
MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4967
SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4968
CmpVT), 0);
4969
Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4970
4971
if (!FoldedBCast)
4972
Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4973
4974
if (IsMasked) {
4975
// Widen the mask.
4976
unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
4977
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4978
InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4979
dl, MaskVT, InMask, RC), 0);
4980
}
4981
}
4982
4983
bool IsTestN = CC == ISD::SETEQ;
4984
unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4985
IsMasked);
4986
4987
MachineSDNode *CNode;
4988
if (FoldedLoad) {
4989
SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4990
4991
if (IsMasked) {
4992
SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4993
Src1.getOperand(0) };
4994
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4995
} else {
4996
SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4997
Src1.getOperand(0) };
4998
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4999
}
5000
5001
// Update the chain.
5002
ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5003
// Record the mem-refs
5004
CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5005
} else {
5006
if (IsMasked)
5007
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5008
else
5009
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5010
}
5011
5012
// If we widened, we need to shrink the mask VT.
5013
if (Widen) {
5014
unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5015
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5016
CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5017
dl, ResVT, SDValue(CNode, 0), RC);
5018
}
5019
5020
ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5021
CurDAG->RemoveDeadNode(Root);
5022
return true;
5023
}
5024
5025
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5026
// into vpternlog.
5027
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5028
assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5029
5030
MVT NVT = N->getSimpleValueType(0);
5031
5032
// Make sure we support VPTERNLOG.
5033
if (!NVT.isVector() || !Subtarget->hasAVX512())
5034
return false;
5035
5036
// We need VLX for 128/256-bit.
5037
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5038
return false;
5039
5040
SDValue N0 = N->getOperand(0);
5041
SDValue N1 = N->getOperand(1);
5042
5043
// Canonicalize AND to LHS.
5044
if (N1.getOpcode() == ISD::AND)
5045
std::swap(N0, N1);
5046
5047
if (N0.getOpcode() != ISD::AND ||
5048
N1.getOpcode() != X86ISD::ANDNP ||
5049
!N0.hasOneUse() || !N1.hasOneUse())
5050
return false;
5051
5052
// ANDN is not commutable, so use it to pin down A and C.
SDValue A = N1.getOperand(0);
SDValue C = N1.getOperand(1);

// AND is commutable: if one operand matches A, the other operand is B.
// Otherwise this isn't a match.
5058
SDValue B;
5059
if (N0.getOperand(0) == A)
5060
B = N0.getOperand(1);
5061
else if (N0.getOperand(1) == A)
5062
B = N0.getOperand(0);
5063
else
5064
return false;
5065
5066
SDLoc dl(N);
5067
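// 0xCA is the truth table of (A & B) | (~A & C): take bits from B where A is
// set and from C where A is clear.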
SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5068
SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5069
ReplaceNode(N, Ternlog.getNode());
5070
5071
return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5072
Ternlog.getNode(), A, B, C, 0xCA);
5073
}
5074
5075
void X86DAGToDAGISel::Select(SDNode *Node) {
5076
MVT NVT = Node->getSimpleValueType(0);
5077
unsigned Opcode = Node->getOpcode();
5078
SDLoc dl(Node);
5079
5080
if (Node->isMachineOpcode()) {
5081
LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5082
Node->setNodeId(-1);
5083
return; // Already selected.
5084
}
5085
5086
switch (Opcode) {
5087
default: break;
5088
case ISD::INTRINSIC_W_CHAIN: {
5089
unsigned IntNo = Node->getConstantOperandVal(1);
5090
switch (IntNo) {
5091
default: break;
5092
case Intrinsic::x86_encodekey128:
5093
case Intrinsic::x86_encodekey256: {
5094
if (!Subtarget->hasKL())
5095
break;
5096
5097
unsigned Opcode;
5098
switch (IntNo) {
5099
default: llvm_unreachable("Impossible intrinsic");
5100
case Intrinsic::x86_encodekey128:
5101
Opcode = X86::ENCODEKEY128;
5102
break;
5103
case Intrinsic::x86_encodekey256:
5104
Opcode = X86::ENCODEKEY256;
5105
break;
5106
}
5107
5108
SDValue Chain = Node->getOperand(0);
5109
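// The ENCODEKEY* instructions take the input key implicitly in XMM0 (with
// the upper half of a 256-bit key in XMM1), so copy the operands into place
// first.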
Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5110
SDValue());
5111
if (Opcode == X86::ENCODEKEY256)
5112
Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5113
Chain.getValue(1));
5114
5115
MachineSDNode *Res = CurDAG->getMachineNode(
5116
Opcode, dl, Node->getVTList(),
5117
{Node->getOperand(2), Chain, Chain.getValue(1)});
5118
ReplaceNode(Node, Res);
5119
return;
5120
}
5121
case Intrinsic::x86_tileloadd64_internal:
5122
case Intrinsic::x86_tileloaddt164_internal: {
5123
if (!Subtarget->hasAMXTILE())
5124
break;
5125
auto *MFI =
5126
CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5127
MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5128
unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5129
? X86::PTILELOADDV
5130
: X86::PTILELOADDT1V;
5131
// _tile_loadd_internal(row, col, buf, STRIDE)
5132
SDValue Base = Node->getOperand(4);
5133
SDValue Scale = getI8Imm(1, dl);
5134
SDValue Index = Node->getOperand(5);
5135
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5136
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5137
SDValue Chain = Node->getOperand(0);
5138
MachineSDNode *CNode;
5139
SDValue Ops[] = {Node->getOperand(2),
5140
Node->getOperand(3),
5141
Base,
5142
Scale,
5143
Index,
5144
Disp,
5145
Segment,
5146
Chain};
5147
CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5148
ReplaceNode(Node, CNode);
5149
return;
5150
}
5151
}
5152
break;
5153
}
5154
case ISD::INTRINSIC_VOID: {
5155
unsigned IntNo = Node->getConstantOperandVal(1);
5156
switch (IntNo) {
5157
default: break;
5158
case Intrinsic::x86_sse3_monitor:
5159
case Intrinsic::x86_monitorx:
5160
case Intrinsic::x86_clzero: {
5161
bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5162
5163
unsigned Opc = 0;
5164
switch (IntNo) {
5165
default: llvm_unreachable("Unexpected intrinsic!");
5166
case Intrinsic::x86_sse3_monitor:
5167
if (!Subtarget->hasSSE3())
5168
break;
5169
Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5170
break;
5171
case Intrinsic::x86_monitorx:
5172
if (!Subtarget->hasMWAITX())
5173
break;
5174
Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5175
break;
5176
case Intrinsic::x86_clzero:
5177
if (!Subtarget->hasCLZERO())
5178
break;
5179
Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5180
break;
5181
}
5182
5183
if (Opc) {
5184
unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5185
SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5186
Node->getOperand(2), SDValue());
5187
SDValue InGlue = Chain.getValue(1);
5188
5189
if (IntNo == Intrinsic::x86_sse3_monitor ||
5190
IntNo == Intrinsic::x86_monitorx) {
5191
// Copy the other two operands to ECX and EDX.
5192
Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5193
InGlue);
5194
InGlue = Chain.getValue(1);
5195
Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5196
InGlue);
5197
InGlue = Chain.getValue(1);
5198
}
5199
5200
MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5201
{ Chain, InGlue});
5202
ReplaceNode(Node, CNode);
5203
return;
5204
}
5205
5206
break;
5207
}
5208
case Intrinsic::x86_tilestored64_internal: {
5209
auto *MFI =
5210
CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5211
MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5212
unsigned Opc = X86::PTILESTOREDV;
5213
// _tile_stored_internal(row, col, buf, STRIDE, c)
5214
SDValue Base = Node->getOperand(4);
5215
SDValue Scale = getI8Imm(1, dl);
5216
SDValue Index = Node->getOperand(5);
5217
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5218
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5219
SDValue Chain = Node->getOperand(0);
5220
MachineSDNode *CNode;
5221
SDValue Ops[] = {Node->getOperand(2),
5222
Node->getOperand(3),
5223
Base,
5224
Scale,
5225
Index,
5226
Disp,
5227
Segment,
5228
Node->getOperand(6),
5229
Chain};
5230
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5231
ReplaceNode(Node, CNode);
5232
return;
5233
}
5234
case Intrinsic::x86_tileloadd64:
5235
case Intrinsic::x86_tileloaddt164:
5236
case Intrinsic::x86_tilestored64: {
5237
if (!Subtarget->hasAMXTILE())
5238
break;
5239
auto *MFI =
5240
CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5241
MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5242
unsigned Opc;
5243
switch (IntNo) {
5244
default: llvm_unreachable("Unexpected intrinsic!");
5245
case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5246
case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5247
case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5248
}
5249
// FIXME: Match displacement and scale.
5250
unsigned TIndex = Node->getConstantOperandVal(2);
5251
SDValue TReg = getI8Imm(TIndex, dl);
5252
SDValue Base = Node->getOperand(3);
5253
SDValue Scale = getI8Imm(1, dl);
5254
SDValue Index = Node->getOperand(4);
5255
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5256
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5257
SDValue Chain = Node->getOperand(0);
5258
MachineSDNode *CNode;
5259
if (Opc == X86::PTILESTORED) {
5260
SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5261
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5262
} else {
5263
SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5264
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5265
}
5266
ReplaceNode(Node, CNode);
5267
return;
5268
}
5269
}
5270
break;
5271
}
5272
case ISD::BRIND:
5273
case X86ISD::NT_BRIND: {
5274
if (Subtarget->isTargetNaCl())
5275
// NaCl has its own pass that converts jmp %r32 into jmp %r64; we leave the
// instruction alone.
5277
break;
5278
if (Subtarget->isTarget64BitILP32()) {
5279
// Converts a 32-bit register to a 64-bit, zero-extended version of
5280
// it. This is needed because x86-64 can do many things, but jmp %r32
5281
// ain't one of them.
5282
SDValue Target = Node->getOperand(1);
5283
assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5284
SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5285
SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5286
Node->getOperand(0), ZextTarget);
5287
ReplaceNode(Node, Brind.getNode());
5288
SelectCode(ZextTarget.getNode());
5289
SelectCode(Brind.getNode());
5290
return;
5291
}
5292
break;
5293
}
5294
case X86ISD::GlobalBaseReg:
5295
ReplaceNode(Node, getGlobalBaseReg());
5296
return;
5297
5298
case ISD::BITCAST:
5299
// Just drop all 128/256/512-bit bitcasts.
5300
if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5301
NVT == MVT::f128) {
5302
ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5303
CurDAG->RemoveDeadNode(Node);
5304
return;
5305
}
5306
break;
5307
5308
case ISD::SRL:
5309
if (matchBitExtract(Node))
5310
return;
5311
[[fallthrough]];
5312
case ISD::SRA:
5313
case ISD::SHL:
5314
if (tryShiftAmountMod(Node))
5315
return;
5316
break;
5317
5318
case X86ISD::VPTERNLOG: {
5319
uint8_t Imm = Node->getConstantOperandVal(3);
5320
if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5321
Node->getOperand(1), Node->getOperand(2), Imm))
5322
return;
5323
break;
5324
}
5325
5326
case X86ISD::ANDNP:
5327
if (tryVPTERNLOG(Node))
5328
return;
5329
break;
5330
5331
case ISD::AND:
5332
if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5333
// Try to form a masked VPTESTM. Operands can be in either order.
5334
SDValue N0 = Node->getOperand(0);
5335
SDValue N1 = Node->getOperand(1);
5336
if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5337
tryVPTESTM(Node, N0, N1))
5338
return;
5339
if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5340
tryVPTESTM(Node, N1, N0))
5341
return;
5342
}
5343
5344
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5345
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5346
CurDAG->RemoveDeadNode(Node);
5347
return;
5348
}
5349
if (matchBitExtract(Node))
5350
return;
5351
if (AndImmShrink && shrinkAndImmediate(Node))
5352
return;
5353
5354
[[fallthrough]];
5355
case ISD::OR:
5356
case ISD::XOR:
5357
if (tryShrinkShlLogicImm(Node))
5358
return;
5359
if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5360
return;
5361
if (tryVPTERNLOG(Node))
5362
return;
5363
5364
[[fallthrough]];
5365
case ISD::ADD:
5366
if (Opcode == ISD::ADD && matchBitExtract(Node))
5367
return;
5368
[[fallthrough]];
5369
case ISD::SUB: {
5370
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select to register form directly to avoid going
// through the isel table which might fold the immediate. We can't change
// the add/sub/and/or/xor with immediate patterns in the tablegen files to
// check the immediate use count without making the patterns unavailable to
// the fast-isel table.
5376
if (!CurDAG->shouldOptForSize())
5377
break;
5378
5379
// Only handle i8/i16/i32/i64.
5380
if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5381
break;
5382
5383
SDValue N0 = Node->getOperand(0);
5384
SDValue N1 = Node->getOperand(1);
5385
5386
auto *Cst = dyn_cast<ConstantSDNode>(N1);
5387
if (!Cst)
5388
break;
5389
5390
int64_t Val = Cst->getSExtValue();
5391
5392
// Make sure it's an immediate that is considered foldable.
5393
// FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5394
if (!isInt<8>(Val) && !isInt<32>(Val))
5395
break;
5396
5397
// If this can match to INC/DEC, let it go.
5398
if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5399
break;
5400
5401
// Check if we should avoid folding this immediate.
5402
if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5403
break;
5404
5405
// We should not fold the immediate. So we need a register form instead.
5406
unsigned ROpc, MOpc;
5407
switch (NVT.SimpleTy) {
5408
default: llvm_unreachable("Unexpected VT!");
5409
case MVT::i8:
5410
switch (Opcode) {
5411
default: llvm_unreachable("Unexpected opcode!");
5412
case ISD::ADD:
5413
ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5414
MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5415
break;
5416
case ISD::SUB:
5417
ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5418
MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5419
break;
5420
case ISD::AND:
5421
ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5422
MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5423
break;
5424
case ISD::OR:
5425
ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5426
MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5427
break;
5428
case ISD::XOR:
5429
ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5430
MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5431
break;
5432
}
5433
break;
5434
case MVT::i16:
5435
switch (Opcode) {
5436
default: llvm_unreachable("Unexpected opcode!");
5437
case ISD::ADD:
5438
ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5439
MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5440
break;
5441
case ISD::SUB:
5442
ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5443
MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5444
break;
5445
case ISD::AND:
5446
ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5447
MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5448
break;
5449
case ISD::OR:
5450
ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5451
MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5452
break;
5453
case ISD::XOR:
5454
ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5455
MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5456
break;
5457
}
5458
break;
5459
case MVT::i32:
5460
switch (Opcode) {
5461
default: llvm_unreachable("Unexpected opcode!");
5462
case ISD::ADD:
5463
ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5464
MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5465
break;
5466
case ISD::SUB:
5467
ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5468
MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5469
break;
5470
case ISD::AND:
5471
ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5472
MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5473
break;
5474
case ISD::OR:
5475
ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5476
MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5477
break;
5478
case ISD::XOR:
5479
ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5480
MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5481
break;
5482
}
5483
break;
5484
case MVT::i64:
5485
switch (Opcode) {
5486
default: llvm_unreachable("Unexpected opcode!");
5487
case ISD::ADD:
5488
ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5489
MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5490
break;
5491
case ISD::SUB:
5492
ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5493
MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5494
break;
5495
case ISD::AND:
5496
ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5497
MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5498
break;
5499
case ISD::OR:
5500
ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5501
MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5502
break;
5503
case ISD::XOR:
5504
ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5505
MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5506
break;
5507
}
5508
break;
5509
}
5510
5511
// Ok, this is an AND/OR/XOR/ADD/SUB with a constant.
5512
5513
// If this is not a subtract, we can still try to fold a load.
5514
if (Opcode != ISD::SUB) {
5515
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5516
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5517
SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5518
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5519
MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5520
// Update the chain.
5521
ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5522
// Record the mem-refs
5523
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5524
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5525
CurDAG->RemoveDeadNode(Node);
5526
return;
5527
}
5528
}
5529
5530
CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5531
return;
5532
}
5533
5534
case X86ISD::SMUL:
5535
// i16/i32/i64 are handled with isel patterns.
5536
if (NVT != MVT::i8)
5537
break;
5538
[[fallthrough]];
5539
case X86ISD::UMUL: {
5540
SDValue N0 = Node->getOperand(0);
5541
SDValue N1 = Node->getOperand(1);
5542
5543
unsigned LoReg, ROpc, MOpc;
5544
switch (NVT.SimpleTy) {
5545
default: llvm_unreachable("Unsupported VT!");
5546
case MVT::i8:
5547
LoReg = X86::AL;
5548
ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5549
MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5550
break;
5551
case MVT::i16:
5552
LoReg = X86::AX;
5553
ROpc = X86::MUL16r;
5554
MOpc = X86::MUL16m;
5555
break;
5556
case MVT::i32:
5557
LoReg = X86::EAX;
5558
ROpc = X86::MUL32r;
5559
MOpc = X86::MUL32m;
5560
break;
5561
case MVT::i64:
5562
LoReg = X86::RAX;
5563
ROpc = X86::MUL64r;
5564
MOpc = X86::MUL64m;
5565
break;
5566
}
5567
5568
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5569
bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5570
// Multiply is commutative.
5571
if (!FoldedLoad) {
5572
FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5573
if (FoldedLoad)
5574
std::swap(N0, N1);
5575
}
5576
5577
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5578
N0, SDValue()).getValue(1);
5579
5580
MachineSDNode *CNode;
5581
if (FoldedLoad) {
5582
// i16/i32/i64 use an instruction that produces a low and high result even
5583
// though only the low result is used.
5584
SDVTList VTs;
5585
if (NVT == MVT::i8)
5586
VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5587
else
5588
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5589
5590
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5591
InGlue };
5592
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5593
5594
// Update the chain.
5595
ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5596
// Record the mem-refs
5597
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5598
} else {
5599
// i16/i32/i64 use an instruction that produces a low and high result even
5600
// though only the low result is used.
5601
SDVTList VTs;
5602
if (NVT == MVT::i8)
5603
VTs = CurDAG->getVTList(NVT, MVT::i32);
5604
else
5605
VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5606
5607
CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5608
}
5609
5610
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5611
ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5612
CurDAG->RemoveDeadNode(Node);
5613
return;
5614
}
5615
5616
case ISD::SMUL_LOHI:
5617
case ISD::UMUL_LOHI: {
5618
SDValue N0 = Node->getOperand(0);
5619
SDValue N1 = Node->getOperand(1);
5620
5621
unsigned Opc, MOpc;
5622
unsigned LoReg, HiReg;
5623
bool IsSigned = Opcode == ISD::SMUL_LOHI;
5624
bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5625
bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5626
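// MULX (BMI2) reads one multiplicand implicitly from EDX/RDX rather than
// EAX/RAX, writes both product halves to explicit destinations, and leaves
// EFLAGS untouched.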
switch (NVT.SimpleTy) {
5627
default: llvm_unreachable("Unsupported VT!");
5628
case MVT::i32:
5629
Opc = UseMULXHi ? X86::MULX32Hrr
5630
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5631
: IsSigned ? X86::IMUL32r
5632
: X86::MUL32r;
5633
MOpc = UseMULXHi ? X86::MULX32Hrm
5634
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5635
: IsSigned ? X86::IMUL32m
5636
: X86::MUL32m;
5637
LoReg = UseMULX ? X86::EDX : X86::EAX;
5638
HiReg = X86::EDX;
5639
break;
5640
case MVT::i64:
5641
Opc = UseMULXHi ? X86::MULX64Hrr
5642
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5643
: IsSigned ? X86::IMUL64r
5644
: X86::MUL64r;
5645
MOpc = UseMULXHi ? X86::MULX64Hrm
5646
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5647
: IsSigned ? X86::IMUL64m
5648
: X86::MUL64m;
5649
LoReg = UseMULX ? X86::RDX : X86::RAX;
5650
HiReg = X86::RDX;
5651
break;
5652
}
5653
5654
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5655
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5656
// Multiply is commutative.
5657
if (!foldedLoad) {
5658
foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5659
if (foldedLoad)
5660
std::swap(N0, N1);
5661
}
5662
5663
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5664
N0, SDValue()).getValue(1);
5665
SDValue ResHi, ResLo;
5666
if (foldedLoad) {
5667
SDValue Chain;
5668
MachineSDNode *CNode = nullptr;
5669
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5670
InGlue };
5671
if (UseMULXHi) {
5672
SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5673
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5674
ResHi = SDValue(CNode, 0);
5675
Chain = SDValue(CNode, 1);
5676
} else if (UseMULX) {
5677
SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5678
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5679
ResHi = SDValue(CNode, 0);
5680
ResLo = SDValue(CNode, 1);
5681
Chain = SDValue(CNode, 2);
5682
} else {
5683
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5684
CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5685
Chain = SDValue(CNode, 0);
5686
InGlue = SDValue(CNode, 1);
5687
}
5688
5689
// Update the chain.
5690
ReplaceUses(N1.getValue(1), Chain);
5691
// Record the mem-refs
5692
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5693
} else {
5694
SDValue Ops[] = { N1, InGlue };
5695
if (UseMULXHi) {
5696
SDVTList VTs = CurDAG->getVTList(NVT);
5697
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5698
ResHi = SDValue(CNode, 0);
5699
} else if (UseMULX) {
5700
SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5701
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5702
ResHi = SDValue(CNode, 0);
5703
ResLo = SDValue(CNode, 1);
5704
} else {
5705
SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5706
SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5707
InGlue = SDValue(CNode, 0);
5708
}
5709
}
5710
5711
// Copy the low half of the result, if it is needed.
5712
if (!SDValue(Node, 0).use_empty()) {
5713
if (!ResLo) {
5714
assert(LoReg && "Register for low half is not defined!");
5715
ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5716
NVT, InGlue);
5717
InGlue = ResLo.getValue(2);
5718
}
5719
ReplaceUses(SDValue(Node, 0), ResLo);
5720
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5721
dbgs() << '\n');
5722
}
5723
// Copy the high half of the result, if it is needed.
5724
if (!SDValue(Node, 1).use_empty()) {
5725
if (!ResHi) {
5726
assert(HiReg && "Register for high half is not defined!");
5727
ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5728
NVT, InGlue);
5729
InGlue = ResHi.getValue(2);
5730
}
5731
ReplaceUses(SDValue(Node, 1), ResHi);
5732
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5733
dbgs() << '\n');
5734
}
5735
5736
CurDAG->RemoveDeadNode(Node);
5737
return;
5738
}
5739
5740
case ISD::SDIVREM:
5741
case ISD::UDIVREM: {
5742
SDValue N0 = Node->getOperand(0);
5743
SDValue N1 = Node->getOperand(1);
5744
5745
unsigned ROpc, MOpc;
5746
bool isSigned = Opcode == ISD::SDIVREM;
5747
if (!isSigned) {
5748
switch (NVT.SimpleTy) {
5749
default: llvm_unreachable("Unsupported VT!");
5750
case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5751
case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5752
case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5753
case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5754
}
5755
} else {
5756
switch (NVT.SimpleTy) {
5757
default: llvm_unreachable("Unsupported VT!");
5758
case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5759
case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5760
case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5761
case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5762
}
5763
}
5764
5765
unsigned LoReg, HiReg, ClrReg;
5766
unsigned SExtOpcode;
5767
switch (NVT.SimpleTy) {
5768
default: llvm_unreachable("Unsupported VT!");
5769
case MVT::i8:
5770
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5771
SExtOpcode = 0; // Not used.
5772
break;
5773
case MVT::i16:
5774
LoReg = X86::AX; HiReg = X86::DX;
5775
ClrReg = X86::DX;
5776
SExtOpcode = X86::CWD;
5777
break;
5778
case MVT::i32:
5779
LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5780
SExtOpcode = X86::CDQ;
5781
break;
5782
case MVT::i64:
5783
LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5784
SExtOpcode = X86::CQO;
5785
break;
5786
}
5787
5788
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5789
bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5790
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5791
5792
SDValue InGlue;
5793
if (NVT == MVT::i8) {
5794
// Special case for div8, just use a move with zero extension to AX to
5795
// clear the upper 8 bits (AH).
5796
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5797
MachineSDNode *Move;
5798
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5799
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5800
unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5801
: X86::MOVZX16rm8;
5802
Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5803
Chain = SDValue(Move, 1);
5804
ReplaceUses(N0.getValue(1), Chain);
5805
// Record the mem-refs
5806
CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5807
} else {
5808
unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5809
: X86::MOVZX16rr8;
5810
Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5811
Chain = CurDAG->getEntryNode();
5812
}
5813
Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5814
SDValue());
5815
InGlue = Chain.getValue(1);
5816
} else {
5817
InGlue =
5818
CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5819
LoReg, N0, SDValue()).getValue(1);
5820
if (isSigned && !signBitIsZero) {
5821
// Sign extend the low part into the high part.
5822
InGlue =
5823
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5824
} else {
5825
// Zero out the high part, effectively zero extending the input.
5826
SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5827
SDValue ClrNode = SDValue(
5828
CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5829
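// MOV32r0 materializes a 32-bit zero. Narrow it with EXTRACT_SUBREG for i16
// or widen it with SUBREG_TO_REG for i64; the implicit zeroing of the upper
// 32 bits by 32-bit writes makes the 64-bit value zero as well.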
switch (NVT.SimpleTy) {
5830
case MVT::i16:
5831
ClrNode =
5832
SDValue(CurDAG->getMachineNode(
5833
TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5834
CurDAG->getTargetConstant(X86::sub_16bit, dl,
5835
MVT::i32)),
5836
0);
5837
break;
5838
case MVT::i32:
5839
break;
5840
case MVT::i64:
5841
ClrNode =
5842
SDValue(CurDAG->getMachineNode(
5843
TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5844
CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5845
CurDAG->getTargetConstant(X86::sub_32bit, dl,
5846
MVT::i32)),
5847
0);
5848
break;
5849
default:
5850
llvm_unreachable("Unexpected division source");
5851
}
5852
5853
InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5854
ClrNode, InGlue).getValue(1);
5855
}
5856
}
5857
5858
if (foldedLoad) {
5859
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5860
InGlue };
5861
MachineSDNode *CNode =
5862
CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5863
InGlue = SDValue(CNode, 1);
5864
// Update the chain.
5865
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5866
// Record the mem-refs
5867
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5868
} else {
5869
InGlue =
5870
SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5871
}
5872
5873
// Prevent use of AH in a REX instruction by explicitly copying it to
5874
// an ABCD_L register.
5875
//
5876
// The current assumption of the register allocator is that isel
5877
// won't generate explicit references to the GR8_ABCD_H registers. If
5878
// the allocator and/or the backend get enhanced to be more robust in
5879
// that regard, this can be, and should be, removed.
5880
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5881
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5882
unsigned AHExtOpcode =
5883
isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5884
5885
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5886
MVT::Glue, AHCopy, InGlue);
5887
SDValue Result(RNode, 0);
5888
InGlue = SDValue(RNode, 1);
5889
5890
Result =
5891
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5892
5893
ReplaceUses(SDValue(Node, 1), Result);
5894
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5895
dbgs() << '\n');
5896
}
5897
// Copy the division (low) result, if it is needed.
5898
if (!SDValue(Node, 0).use_empty()) {
5899
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5900
LoReg, NVT, InGlue);
5901
InGlue = Result.getValue(2);
5902
ReplaceUses(SDValue(Node, 0), Result);
5903
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5904
dbgs() << '\n');
5905
}
5906
// Copy the remainder (high) result, if it is needed.
5907
if (!SDValue(Node, 1).use_empty()) {
5908
SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5909
HiReg, NVT, InGlue);
5910
InGlue = Result.getValue(2);
5911
ReplaceUses(SDValue(Node, 1), Result);
5912
LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5913
dbgs() << '\n');
5914
}
5915
CurDAG->RemoveDeadNode(Node);
5916
return;
5917
}
5918
5919
case X86ISD::FCMP:
5920
case X86ISD::STRICT_FCMP:
5921
case X86ISD::STRICT_FCMPS: {
5922
bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5923
Node->getOpcode() == X86ISD::STRICT_FCMPS;
5924
SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5925
SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5926
5927
// Save the original VT of the compare.
5928
MVT CmpVT = N0.getSimpleValueType();
5929
5930
// Floating point needs special handling if we don't have FCOMI.
5931
if (Subtarget->canUseCMOV())
5932
break;
5933
5934
bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5935
5936
unsigned Opc;
5937
switch (CmpVT.SimpleTy) {
5938
default: llvm_unreachable("Unexpected type!");
5939
case MVT::f32:
5940
Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5941
break;
5942
case MVT::f64:
5943
Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5944
break;
5945
case MVT::f80:
5946
Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5947
break;
5948
}
5949
5950
SDValue Chain =
5951
IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5952
SDValue Glue;
5953
if (IsStrictCmp) {
5954
SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5955
Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5956
Glue = Chain.getValue(1);
5957
} else {
5958
Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5959
}
5960
5961
// Move FPSW to AX.
5962
SDValue FNSTSW =
5963
SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5964
5965
// Extract upper 8-bits of AX.
5966
SDValue Extract =
5967
CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5968
5969
// Move AH into flags.
5970
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
5971
assert(Subtarget->canUseLAHFSAHF() &&
5972
"Target doesn't support SAHF or FCOMI?");
5973
SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5974
Chain = AH;
5975
SDValue SAHF = SDValue(
5976
CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5977
5978
if (IsStrictCmp)
5979
ReplaceUses(SDValue(Node, 1), Chain);
5980
5981
ReplaceUses(SDValue(Node, 0), SAHF);
5982
CurDAG->RemoveDeadNode(Node);
5983
return;
5984
}
5985
5986
case X86ISD::CMP: {
5987
SDValue N0 = Node->getOperand(0);
5988
SDValue N1 = Node->getOperand(1);
5989
5990
// Optimizations for TEST compares.
5991
if (!isNullConstant(N1))
5992
break;
5993
5994
// Save the original VT of the compare.
5995
MVT CmpVT = N0.getSimpleValueType();
5996
5997
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5998
// by a test instruction. The test should be removed later by
5999
// analyzeCompare if we are using only the zero flag.
6000
// TODO: Should we check the users and use the BEXTR flags directly?
6001
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6002
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6003
unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6004
: X86::TEST32rr;
6005
SDValue BEXTR = SDValue(NewNode, 0);
6006
NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6007
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6008
CurDAG->RemoveDeadNode(Node);
6009
return;
6010
}
6011
}
6012
6013
// We can peek through truncates, but we need to be careful below.
6014
if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6015
N0 = N0.getOperand(0);
6016
6017
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6018
// use a smaller encoding.
6019
// Look past the truncate if CMP is the only use of it.
6020
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6021
N0.getValueType() != MVT::i8) {
6022
auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6023
if (!MaskC)
6024
break;
6025
6026
// We may have looked through a truncate so mask off any bits that
6027
// shouldn't be part of the compare.
6028
uint64_t Mask = MaskC->getZExtValue();
6029
Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6030
6031
// Check if we can replace AND+IMM{32,64} with a shift. This is possible
6032
// for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6033
// zero flag.
6034
if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6035
onlyUsesZeroFlag(SDValue(Node, 0))) {
6036
unsigned ShiftOpcode = ISD::DELETED_NODE;
6037
unsigned ShiftAmt;
6038
unsigned SubRegIdx;
6039
MVT SubRegVT;
6040
unsigned TestOpcode;
6041
unsigned LeadingZeros = llvm::countl_zero(Mask);
6042
unsigned TrailingZeros = llvm::countr_zero(Mask);
6043
6044
// With leading/trailing zeros, the transform is profitable if we can
6045
// eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6046
// incurring any extra register moves.
6047
bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6048
if (LeadingZeros == 0 && SavesBytes) {
6049
// If the mask covers the most significant bit, then we can replace
6050
// TEST+AND with a SHR and check eflags.
6051
// This emits a redundant TEST which is subsequently eliminated.
6052
ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6053
ShiftAmt = TrailingZeros;
6054
SubRegIdx = 0;
6055
TestOpcode = X86::TEST64rr;
6056
} else if (TrailingZeros == 0 && SavesBytes) {
6057
// If the mask covers the least significant bit, then we can replace
6058
// TEST+AND with a SHL and check eflags.
6059
// This emits a redundant TEST which is subsequently eliminated.
6060
ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6061
ShiftAmt = LeadingZeros;
6062
SubRegIdx = 0;
6063
TestOpcode = X86::TEST64rr;
6064
} else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6065
// If the shifted mask extends into the high half and is 8/16/32 bits
6066
// wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6067
unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6068
if (PopCount == 8) {
6069
ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6070
ShiftAmt = TrailingZeros;
6071
SubRegIdx = X86::sub_8bit;
6072
SubRegVT = MVT::i8;
6073
TestOpcode = X86::TEST8rr;
6074
} else if (PopCount == 16) {
6075
ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6076
ShiftAmt = TrailingZeros;
6077
SubRegIdx = X86::sub_16bit;
6078
SubRegVT = MVT::i16;
6079
TestOpcode = X86::TEST16rr;
6080
} else if (PopCount == 32) {
6081
ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6082
ShiftAmt = TrailingZeros;
6083
SubRegIdx = X86::sub_32bit;
6084
SubRegVT = MVT::i32;
6085
TestOpcode = X86::TEST32rr;
6086
}
6087
}
6088
if (ShiftOpcode != ISD::DELETED_NODE) {
6089
SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6090
SDValue Shift = SDValue(
6091
CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6092
N0.getOperand(0), ShiftC),
6093
0);
6094
if (SubRegIdx != 0) {
6095
Shift =
6096
CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6097
}
6098
MachineSDNode *Test =
6099
CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6100
ReplaceNode(Node, Test);
6101
return;
6102
}
6103
}
6104
6105
MVT VT;
6106
int SubRegOp;
6107
unsigned ROpc, MOpc;
6108
6109
// For each of these checks we need to be careful if the sign flag is
6110
// being used. It is only safe to use the sign flag in two conditions,
6111
// either the sign bit in the shrunken mask is zero or the final test
6112
// size is equal to the original compare size.
6113
6114
if (isUInt<8>(Mask) &&
6115
(!(Mask & 0x80) || CmpVT == MVT::i8 ||
6116
hasNoSignFlagUses(SDValue(Node, 0)))) {
6117
// For example, convert "testl %eax, $8" to "testb %al, $8"
6118
VT = MVT::i8;
6119
SubRegOp = X86::sub_8bit;
6120
ROpc = X86::TEST8ri;
6121
MOpc = X86::TEST8mi;
6122
} else if (OptForMinSize && isUInt<16>(Mask) &&
6123
(!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6124
hasNoSignFlagUses(SDValue(Node, 0)))) {
6125
// For example, "testl %eax, $32776" to "testw %ax, $32776".
6126
// NOTE: We only want to form TESTW instructions if optimizing for
6127
// min size. Otherwise we only save one byte and possibly get a length
6128
// changing prefix penalty in the decoders.
6129
VT = MVT::i16;
6130
SubRegOp = X86::sub_16bit;
6131
ROpc = X86::TEST16ri;
6132
MOpc = X86::TEST16mi;
6133
} else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6134
((!(Mask & 0x80000000) &&
6135
// Without minsize 16-bit Cmps can get here so we need to
6136
// be sure we calculate the correct sign flag if needed.
6137
(CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6138
CmpVT == MVT::i32 ||
6139
hasNoSignFlagUses(SDValue(Node, 0)))) {
6140
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6141
// NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6142
// Otherwise, we find ourselves in a position where we have to do
6143
// promotion. If previous passes did not promote the and, we assume
6144
// they had a good reason not to and do not promote here.
6145
VT = MVT::i32;
6146
SubRegOp = X86::sub_32bit;
6147
ROpc = X86::TEST32ri;
6148
MOpc = X86::TEST32mi;
6149
} else {
6150
// No eligible transformation was found.
6151
break;
6152
}
6153
6154
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6155
SDValue Reg = N0.getOperand(0);
6156
6157
// Emit a testl or testw.
6158
MachineSDNode *NewNode;
6159
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6160
if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6161
if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6162
if (!LoadN->isSimple()) {
6163
unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6164
if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6165
(MOpc == X86::TEST16mi && NumVolBits != 16) ||
6166
(MOpc == X86::TEST32mi && NumVolBits != 32))
6167
break;
6168
}
6169
}
6170
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6171
Reg.getOperand(0) };
6172
NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6173
// Update the chain.
6174
ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6175
// Record the mem-refs
6176
CurDAG->setNodeMemRefs(NewNode,
6177
{cast<LoadSDNode>(Reg)->getMemOperand()});
6178
} else {
6179
// Extract the subregister if necessary.
6180
if (N0.getValueType() != VT)
6181
Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6182
6183
NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6184
}
6185
// Replace CMP with TEST.
6186
ReplaceNode(Node, NewNode);
6187
return;
6188
}
6189
break;
6190
}
6191
case X86ISD::PCMPISTR: {
6192
if (!Subtarget->hasSSE42())
6193
break;
6194
6195
bool NeedIndex = !SDValue(Node, 0).use_empty();
6196
bool NeedMask = !SDValue(Node, 1).use_empty();
6197
// We can't fold a load if we are going to make two instructions.
6198
bool MayFoldLoad = !NeedIndex || !NeedMask;
6199
6200
MachineSDNode *CNode;
6201
if (NeedMask) {
6202
unsigned ROpc =
6203
Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6204
unsigned MOpc =
6205
Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6206
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6207
ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6208
}
6209
if (NeedIndex || !NeedMask) {
6210
unsigned ROpc =
6211
Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6212
unsigned MOpc =
6213
Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6214
CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6215
ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6216
}
6217
6218
// Connect the flag usage to the last instruction created.
6219
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6220
CurDAG->RemoveDeadNode(Node);
6221
return;
6222
}
6223
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InGlue).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

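  // Vector compares whose result feeds a mask can sometimes be selected as
  // VPTESTM/VPTESTNM; tryVPTESTM attempts that before falling back to the
  // generated matcher.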
  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;

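  // SETCC_CARRY materializes 0 or all-ones from the carry flag. When the
  // subtarget breaks the false dependence of "sbb reg, reg" we can use
  // SETB_C32r/SETB_C64r directly; otherwise getSBBZero zeroes the input first
  // to avoid a dependence on the previous register value.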
  case X86ISD::SETCC_CARRY: {
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to the next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64 bits, otherwise use
      // the 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }

    // For less than 32 bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
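  // An SBB with both inputs zero is the "materialize the carry flag" idiom;
  // reuse getSBBZero so it is selected the same way as SETCC_CARRY.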
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
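    // Pick the gather opcode from the index type, element count, and element
    // size. In the opcode names the letter after GATHER/PGATHER gives the
    // index width (D = 32-bit, Q = 64-bit), the rest gives the element type
    // (PS/PD for FP, D/Q for integer), and the Z128/Z256/Z suffixes select the
    // 128/256/512-bit EVEX forms; the non-Z names below are the VEX-encoded
    // AVX2 gathers, which take a vector mask instead of a k-register.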
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
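    // Same opcode naming scheme as the gathers above, but scatters only exist
    // in EVEX (AVX-512) form, so there is no VEX fallback path here.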
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
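  // The "preallocated" calling-convention nodes are lowered to the generic
  // PREALLOCATED_SETUP / PREALLOCATED_ARG target opcodes, carrying the call
  // site id that X86MachineFunctionInfo assigned to this call.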
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

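    // The wide Key Locker instructions implicitly read and write XMM0-XMM7,
    // so copy the eight data operands into those registers, threading the
    // chain and glue through each copy.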
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable ??
  case InlineAsm::ConstraintCode::v: // not offsetable ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

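  // selectAddr produced the five components of an X86 memory operand:
  // base, scale, index, displacement, and segment, in that order.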
  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}