Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to a X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86ISelDAGToDAG.h"
#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

extern cl::opt<bool> IndirectBranchTracking;

//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex = 0;

    unsigned Scale = 1;
    SDValue IndexReg;
    int32_t Disp = 0;
    SDValue Segment;
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment; // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
             "OptForMinSize implies OptForSize");
      return SelectionDAGISel::runOnMachineFunction(MF);
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

    // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool
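    // Illustrative note (editorial addition, not part of the upstream file):
    // the Base/Scale/Index/Disp/Segment outputs filled in by these selectors
    // correspond to the standard x86 memory operand
    //   [Base + Scale*Index + Disp]  (plus an optional segment override),
    // e.g. the operand in "movl 8(%rax,%rcx,4), %edx" has Base=%rax, Scale=4,
    // Index=%rcx, Disp=8 and no segment.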
    selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                     SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
                                         : GET_ND_IF_ENABLED(X86::NEG32r);
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                             AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->uses()) {
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above)
        // Those instruction won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(N);
        if (C && isInt<8>(C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone.
These are typically375// used to indicate SP offsets for argument passing and376// will get pulled into stores/pushes (implicitly).377if (User->getOpcode() == X86ISD::ADD ||378User->getOpcode() == ISD::ADD ||379User->getOpcode() == X86ISD::SUB ||380User->getOpcode() == ISD::SUB) {381382// Find the other operand of the add/sub.383SDValue OtherOp = User->getOperand(0);384if (OtherOp.getNode() == N)385OtherOp = User->getOperand(1);386387// Don't count if the other operand is SP.388RegisterSDNode *RegNode;389if (OtherOp->getOpcode() == ISD::CopyFromReg &&390(RegNode = dyn_cast_or_null<RegisterSDNode>(391OtherOp->getOperand(1).getNode())))392if ((RegNode->getReg() == X86::ESP) ||393(RegNode->getReg() == X86::RSP))394continue;395}396397// ... otherwise, count this and move on.398UseCount++;399}400401// If we have more than 1 use, then recommend for hoisting.402return (UseCount > 1);403}404405/// Return a target constant with the specified value of type i8.406inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {407return CurDAG->getTargetConstant(Imm, DL, MVT::i8);408}409410/// Return a target constant with the specified value, of type i32.411inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {412return CurDAG->getTargetConstant(Imm, DL, MVT::i32);413}414415/// Return a target constant with the specified value, of type i64.416inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {417return CurDAG->getTargetConstant(Imm, DL, MVT::i64);418}419420SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,421const SDLoc &DL) {422assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");423uint64_t Index = N->getConstantOperandVal(1);424MVT VecVT = N->getOperand(0).getSimpleValueType();425return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);426}427428SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,429const SDLoc &DL) {430assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");431uint64_t Index = N->getConstantOperandVal(2);432MVT VecVT = N->getSimpleValueType(0);433return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);434}435436SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,437const SDLoc &DL) {438assert(VecWidth == 128 && "Unexpected vector width");439uint64_t Index = N->getConstantOperandVal(2);440MVT VecVT = N->getSimpleValueType(0);441uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;442assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");443// vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)444// vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)445return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);446}447448SDValue getSBBZero(SDNode *N) {449SDLoc dl(N);450MVT VT = N->getSimpleValueType(0);451452// Create zero.453SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);454SDValue Zero = SDValue(455CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);456if (VT == MVT::i64) {457Zero = SDValue(458CurDAG->getMachineNode(459TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,460CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,461CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),4620);463}464465// Copy flags to the EFLAGS register and glue it to next node.466unsigned Opcode = N->getOpcode();467assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&468"Unexpected opcode for SBB materialization");469unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 
2 : 1;470SDValue EFLAGS =471CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,472N->getOperand(FlagOpIndex), SDValue());473474// Create a 64-bit instruction if the result is 64-bits otherwise use the475// 32-bit version.476unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;477MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;478VTs = CurDAG->getVTList(SBBVT, MVT::i32);479return SDValue(480CurDAG->getMachineNode(Opc, dl, VTs,481{Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),4820);483}484485// Helper to detect unneeded and instructions on shift amounts. Called486// from PatFrags in tablegen.487bool isUnneededShiftMask(SDNode *N, unsigned Width) const {488assert(N->getOpcode() == ISD::AND && "Unexpected opcode");489const APInt &Val = N->getConstantOperandAPInt(1);490491if (Val.countr_one() >= Width)492return true;493494APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;495return Mask.countr_one() >= Width;496}497498/// Return an SDNode that returns the value of the global base register.499/// Output instructions required to initialize the global base register,500/// if necessary.501SDNode *getGlobalBaseReg();502503/// Return a reference to the TargetMachine, casted to the target-specific504/// type.505const X86TargetMachine &getTargetMachine() const {506return static_cast<const X86TargetMachine &>(TM);507}508509/// Return a reference to the TargetInstrInfo, casted to the target-specific510/// type.511const X86InstrInfo *getInstrInfo() const {512return Subtarget->getInstrInfo();513}514515/// Return a condition code of the given SDNode516X86::CondCode getCondFromNode(SDNode *N) const;517518/// Address-mode matching performs shift-of-and to and-of-shift519/// reassociation in order to expose more scaled addressing520/// opportunities.521bool ComplexPatternFuncMutatesDAG() const override {522return true;523}524525bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;526527// Indicates we should prefer to use a non-temporal load for this load.528bool useNonTemporalLoad(LoadSDNode *N) const {529if (!N->isNonTemporal())530return false;531532unsigned StoreSize = N->getMemoryVT().getStoreSize();533534if (N->getAlign().value() < StoreSize)535return false;536537switch (StoreSize) {538default: llvm_unreachable("Unsupported store size");539case 4:540case 8:541return false;542case 16:543return Subtarget->hasSSE41();544case 32:545return Subtarget->hasAVX2();546case 64:547return Subtarget->hasAVX512();548}549}550551bool foldLoadStoreIntoMemOperand(SDNode *Node);552MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);553bool matchBitExtract(SDNode *Node);554bool shrinkAndImmediate(SDNode *N);555bool isMaskZeroExtended(SDNode *N) const;556bool tryShiftAmountMod(SDNode *N);557bool tryShrinkShlLogicImm(SDNode *N);558bool tryVPTERNLOG(SDNode *N);559bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,560SDNode *ParentC, SDValue A, SDValue B, SDValue C,561uint8_t Imm);562bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);563bool tryMatchBitSelect(SDNode *N);564565MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,566const SDLoc &dl, MVT VT, SDNode *Node);567MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,568const SDLoc &dl, MVT VT, SDNode *Node,569SDValue &InGlue);570571bool tryOptimizeRem8Extend(SDNode *N);572573bool onlyUsesZeroFlag(SDValue Flags) const;574bool hasNoSignFlagUses(SDValue Flags) const;575bool hasNoCarryFlagUses(SDValue Flags) const;576};577578class X86DAGToDAGISelLegacy : public 
SelectionDAGISelLegacy {579public:580static char ID;581explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,582CodeGenOptLevel OptLevel)583: SelectionDAGISelLegacy(584ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}585};586}587588char X86DAGToDAGISelLegacy::ID = 0;589590INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)591592// Returns true if this masked compare can be implemented legally with this593// type.594static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {595unsigned Opcode = N->getOpcode();596if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||597Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||598Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {599// We can get 256-bit 8 element types here without VLX being enabled. When600// this happens we will use 512-bit operations and the mask will not be601// zero extended.602EVT OpVT = N->getOperand(0).getValueType();603// The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the604// second operand.605if (Opcode == X86ISD::STRICT_CMPM)606OpVT = N->getOperand(1).getValueType();607if (OpVT.is256BitVector() || OpVT.is128BitVector())608return Subtarget->hasVLX();609610return true;611}612// Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.613if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||614Opcode == X86ISD::FSETCCM_SAE)615return true;616617return false;618}619620// Returns true if we can assume the writer of the mask has zero extended it621// for us.622bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {623// If this is an AND, check if we have a compare on either side. As long as624// one side guarantees the mask is zero extended, the AND will preserve those625// zeros.626if (N->getOpcode() == ISD::AND)627return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||628isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);629630return isLegalMaskCompare(N, Subtarget);631}632633bool634X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {635if (OptLevel == CodeGenOptLevel::None)636return false;637638if (!N.hasOneUse())639return false;640641if (N.getOpcode() != ISD::LOAD)642return true;643644// Don't fold non-temporal loads if we have an instruction for them.645if (useNonTemporalLoad(cast<LoadSDNode>(N)))646return false;647648// If N is a load, do additional profitability checks.649if (U == Root) {650switch (U->getOpcode()) {651default: break;652case X86ISD::ADD:653case X86ISD::ADC:654case X86ISD::SUB:655case X86ISD::SBB:656case X86ISD::AND:657case X86ISD::XOR:658case X86ISD::OR:659case ISD::ADD:660case ISD::UADDO_CARRY:661case ISD::AND:662case ISD::OR:663case ISD::XOR: {664SDValue Op1 = U->getOperand(1);665666// If the other operand is a 8-bit immediate we should fold the immediate667// instead. This reduces code size.668// e.g.669// movl 4(%esp), %eax670// addl $4, %eax671// vs.672// movl $4, %eax673// addl 4(%esp), %eax674// The former is 2 bytes shorter. In case where the increment is 1, then675// the saving can be 4 bytes (by using incl %eax).676if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {677if (Imm->getAPIntValue().isSignedIntN(8))678return false;679680// If this is a 64-bit AND with an immediate that fits in 32-bits,681// prefer using the smaller and over folding the load. 
This is needed to682// make sure immediates created by shrinkAndImmediate are always folded.683// Ideally we would narrow the load during DAG combine and get the684// best of both worlds.685if (U->getOpcode() == ISD::AND &&686Imm->getAPIntValue().getBitWidth() == 64 &&687Imm->getAPIntValue().isIntN(32))688return false;689690// If this really a zext_inreg that can be represented with a movzx691// instruction, prefer that.692// TODO: We could shrink the load and fold if it is non-volatile.693if (U->getOpcode() == ISD::AND &&694(Imm->getAPIntValue() == UINT8_MAX ||695Imm->getAPIntValue() == UINT16_MAX ||696Imm->getAPIntValue() == UINT32_MAX))697return false;698699// ADD/SUB with can negate the immediate and use the opposite operation700// to fit 128 into a sign extended 8 bit immediate.701if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&702(-Imm->getAPIntValue()).isSignedIntN(8))703return false;704705if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&706(-Imm->getAPIntValue()).isSignedIntN(8) &&707hasNoCarryFlagUses(SDValue(U, 1)))708return false;709}710711// If the other operand is a TLS address, we should fold it instead.712// This produces713// movl %gs:0, %eax714// leal i@NTPOFF(%eax), %eax715// instead of716// movl $i@NTPOFF, %eax717// addl %gs:0, %eax718// if the block also has an access to a second TLS address this will save719// a load.720// FIXME: This is probably also true for non-TLS addresses.721if (Op1.getOpcode() == X86ISD::Wrapper) {722SDValue Val = Op1.getOperand(0);723if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)724return false;725}726727// Don't fold load if this matches the BTS/BTR/BTC patterns.728// BTS: (or X, (shl 1, n))729// BTR: (and X, (rotl -2, n))730// BTC: (xor X, (shl 1, n))731if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {732if (U->getOperand(0).getOpcode() == ISD::SHL &&733isOneConstant(U->getOperand(0).getOperand(0)))734return false;735736if (U->getOperand(1).getOpcode() == ISD::SHL &&737isOneConstant(U->getOperand(1).getOperand(0)))738return false;739}740if (U->getOpcode() == ISD::AND) {741SDValue U0 = U->getOperand(0);742SDValue U1 = U->getOperand(1);743if (U0.getOpcode() == ISD::ROTL) {744auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));745if (C && C->getSExtValue() == -2)746return false;747}748749if (U1.getOpcode() == ISD::ROTL) {750auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));751if (C && C->getSExtValue() == -2)752return false;753}754}755756break;757}758case ISD::SHL:759case ISD::SRA:760case ISD::SRL:761// Don't fold a load into a shift by immediate. The BMI2 instructions762// support folding a load, but not an immediate. The legacy instructions763// support folding an immediate, but can't fold a load. Folding an764// immediate is preferable to folding a load.765if (isa<ConstantSDNode>(U->getOperand(1)))766return false;767768break;769}770}771772// Prevent folding a load if this can implemented with an insert_subreg or773// a move that implicitly zeroes.774if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&775isNullConstant(Root->getOperand(2)) &&776(Root->getOperand(0).isUndef() ||777ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))778return false;779780return true;781}782783// Indicates it is profitable to form an AVX512 masked operation. 
Returning784// false will favor a masked register-register masked move or vblendm and the785// operation will be selected separately.786bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {787assert(788(N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&789"Unexpected opcode!");790791// If the operation has additional users, the operation will be duplicated.792// Check the use count to prevent that.793// FIXME: Are there cheap opcodes we might want to duplicate?794return N->getOperand(1).hasOneUse();795}796797/// Replace the original chain operand of the call with798/// load's chain operand and move load below the call's chain operand.799static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,800SDValue Call, SDValue OrigChain) {801SmallVector<SDValue, 8> Ops;802SDValue Chain = OrigChain.getOperand(0);803if (Chain.getNode() == Load.getNode())804Ops.push_back(Load.getOperand(0));805else {806assert(Chain.getOpcode() == ISD::TokenFactor &&807"Unexpected chain operand");808for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)809if (Chain.getOperand(i).getNode() == Load.getNode())810Ops.push_back(Load.getOperand(0));811else812Ops.push_back(Chain.getOperand(i));813SDValue NewChain =814CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);815Ops.clear();816Ops.push_back(NewChain);817}818Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());819CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);820CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),821Load.getOperand(1), Load.getOperand(2));822823Ops.clear();824Ops.push_back(SDValue(Load.getNode(), 1));825Ops.append(Call->op_begin() + 1, Call->op_end());826CurDAG->UpdateNodeOperands(Call.getNode(), Ops);827}828829/// Return true if call address is a load and it can be830/// moved below CALLSEQ_START and the chains leading up to the call.831/// Return the CALLSEQ_START by reference as a second output.832/// In the case of a tail call, there isn't a callseq node between the call833/// chain and the load.834static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {835// The transformation is somewhat dangerous if the call's chain was glued to836// the call. After MoveBelowOrigChain the load is moved between the call and837// the chain, this can create a cycle if the load is not folded. So it is838// *really* important that we are sure the load will be folded.839if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())840return false;841auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());842if (!LD ||843!LD->isSimple() ||844LD->getAddressingMode() != ISD::UNINDEXED ||845LD->getExtensionType() != ISD::NON_EXTLOAD)846return false;847848// Now let's find the callseq_start.849while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {850if (!Chain.hasOneUse())851return false;852Chain = Chain.getOperand(0);853}854855if (!Chain.getNumOperands())856return false;857// Since we are not checking for AA here, conservatively abort if the chain858// writes to memory. 
It's not safe to move the callee (a load) across a store.859if (isa<MemSDNode>(Chain.getNode()) &&860cast<MemSDNode>(Chain.getNode())->writeMem())861return false;862if (Chain.getOperand(0).getNode() == Callee.getNode())863return true;864if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&865Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&866Callee.getValue(1).hasOneUse())867return true;868return false;869}870871static bool isEndbrImm64(uint64_t Imm) {872// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.873// i.g: 0xF3660F1EFA, 0xF3670F1EFA874if ((Imm & 0x00FFFFFF) != 0x0F1EFA)875return false;876877uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,8780x65, 0x66, 0x67, 0xf0, 0xf2};879int i = 24; // 24bit 0x0F1EFA has matched880while (i < 64) {881uint8_t Byte = (Imm >> i) & 0xFF;882if (Byte == 0xF3)883return true;884if (!llvm::is_contained(OptionalPrefixBytes, Byte))885return false;886i += 8;887}888889return false;890}891892static bool needBWI(MVT VT) {893return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);894}895896void X86DAGToDAGISel::PreprocessISelDAG() {897bool MadeChange = false;898for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),899E = CurDAG->allnodes_end(); I != E; ) {900SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.901902// This is for CET enhancement.903//904// ENDBR32 and ENDBR64 have specific opcodes:905// ENDBR32: F3 0F 1E FB906// ENDBR64: F3 0F 1E FA907// And we want that attackers won’t find unintended ENDBR32/64908// opcode matches in the binary909// Here’s an example:910// If the compiler had to generate asm for the following code:911// a = 0xF30F1EFA912// it could, for example, generate:913// mov 0xF30F1EFA, dword ptr[a]914// In such a case, the binary would include a gadget that starts915// with a fake ENDBR64 opcode. Therefore, we split such generation916// into multiple operations, let it not shows in the binary917if (N->getOpcode() == ISD::Constant) {918MVT VT = N->getSimpleValueType(0);919int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();920int32_t EndbrImm = Subtarget->is64Bit() ? 
0xF30F1EFA : 0xF30F1EFB;921if (Imm == EndbrImm || isEndbrImm64(Imm)) {922// Check that the cf-protection-branch is enabled.923Metadata *CFProtectionBranch =924MF->getFunction().getParent()->getModuleFlag(925"cf-protection-branch");926if (CFProtectionBranch || IndirectBranchTracking) {927SDLoc dl(N);928SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);929Complement = CurDAG->getNOT(dl, Complement, VT);930--I;931CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);932++I;933MadeChange = true;934continue;935}936}937}938939// If this is a target specific AND node with no flag usages, turn it back940// into ISD::AND to enable test instruction matching.941if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {942SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),943N->getOperand(0), N->getOperand(1));944--I;945CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);946++I;947MadeChange = true;948continue;949}950951// Convert vector increment or decrement to sub/add with an all-ones952// constant:953// add X, <1, 1...> --> sub X, <-1, -1...>954// sub X, <1, 1...> --> add X, <-1, -1...>955// The all-ones vector constant can be materialized using a pcmpeq956// instruction that is commonly recognized as an idiom (has no register957// dependency), so that's better/smaller than loading a splat 1 constant.958//959// But don't do this if it would inhibit a potentially profitable load960// folding opportunity for the other operand. That only occurs with the961// intersection of:962// (1) The other operand (op0) is load foldable.963// (2) The op is an add (otherwise, we are *creating* an add and can still964// load fold the other op).965// (3) The target has AVX (otherwise, we have a destructive add and can't966// load fold the other op without killing the constant op).967// (4) The constant 1 vector has multiple uses (so it is profitable to load968// into a register anyway).969auto mayPreventLoadFold = [&]() {970return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&971N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&972!N->getOperand(1).hasOneUse();973};974if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&975N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {976APInt SplatVal;977if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&978SplatVal.isOne()) {979SDLoc DL(N);980981MVT VT = N->getSimpleValueType(0);982unsigned NumElts = VT.getSizeInBits() / 32;983SDValue AllOnes =984CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));985AllOnes = CurDAG->getBitcast(VT, AllOnes);986987unsigned NewOpcode = N->getOpcode() == ISD::ADD ? 
ISD::SUB : ISD::ADD;988SDValue Res =989CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);990--I;991CurDAG->ReplaceAllUsesWith(N, Res.getNode());992++I;993MadeChange = true;994continue;995}996}997998switch (N->getOpcode()) {999case X86ISD::VBROADCAST: {1000MVT VT = N->getSimpleValueType(0);1001// Emulate v32i16/v64i8 broadcast without BWI.1002if (!Subtarget->hasBWI() && needBWI(VT)) {1003MVT NarrowVT = VT.getHalfNumVectorElementsVT();1004SDLoc dl(N);1005SDValue NarrowBCast =1006CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));1007SDValue Res =1008CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),1009NarrowBCast, CurDAG->getIntPtrConstant(0, dl));1010unsigned Index = NarrowVT.getVectorMinNumElements();1011Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,1012CurDAG->getIntPtrConstant(Index, dl));10131014--I;1015CurDAG->ReplaceAllUsesWith(N, Res.getNode());1016++I;1017MadeChange = true;1018continue;1019}10201021break;1022}1023case X86ISD::VBROADCAST_LOAD: {1024MVT VT = N->getSimpleValueType(0);1025// Emulate v32i16/v64i8 broadcast without BWI.1026if (!Subtarget->hasBWI() && needBWI(VT)) {1027MVT NarrowVT = VT.getHalfNumVectorElementsVT();1028auto *MemNode = cast<MemSDNode>(N);1029SDLoc dl(N);1030SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);1031SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};1032SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(1033X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),1034MemNode->getMemOperand());1035SDValue Res =1036CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),1037NarrowBCast, CurDAG->getIntPtrConstant(0, dl));1038unsigned Index = NarrowVT.getVectorMinNumElements();1039Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,1040CurDAG->getIntPtrConstant(Index, dl));10411042--I;1043SDValue To[] = {Res, NarrowBCast.getValue(1)};1044CurDAG->ReplaceAllUsesWith(N, To);1045++I;1046MadeChange = true;1047continue;1048}10491050break;1051}1052case ISD::LOAD: {1053// If this is a XMM/YMM load of the same lower bits as another YMM/ZMM1054// load, then just extract the lower subvector and avoid the second load.1055auto *Ld = cast<LoadSDNode>(N);1056MVT VT = N->getSimpleValueType(0);1057if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||1058!(VT.is128BitVector() || VT.is256BitVector()))1059break;10601061MVT MaxVT = VT;1062SDNode *MaxLd = nullptr;1063SDValue Ptr = Ld->getBasePtr();1064SDValue Chain = Ld->getChain();1065for (SDNode *User : Ptr->uses()) {1066auto *UserLd = dyn_cast<LoadSDNode>(User);1067MVT UserVT = User->getSimpleValueType(0);1068if (User != N && UserLd && ISD::isNormalLoad(User) &&1069UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&1070!User->hasAnyUseOfValue(1) &&1071(UserVT.is256BitVector() || UserVT.is512BitVector()) &&1072UserVT.getSizeInBits() > VT.getSizeInBits() &&1073(!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {1074MaxLd = User;1075MaxVT = UserVT;1076}1077}1078if (MaxLd) {1079SDLoc dl(N);1080unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();1081MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);1082SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,1083SDValue(MaxLd, 0),1084CurDAG->getIntPtrConstant(0, dl));1085SDValue Res = CurDAG->getBitcast(VT, Extract);10861087--I;1088SDValue To[] = {Res, SDValue(MaxLd, 1)};1089CurDAG->ReplaceAllUsesWith(N, To);1090++I;1091MadeChange = true;1092continue;1093}1094break;1095}1096case ISD::VSELECT: 
{1097// Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.1098EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();1099if (EleVT == MVT::i1)1100break;11011102assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");1103assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&1104"We can't replace VSELECT with BLENDV in vXi16!");1105SDValue R;1106if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==1107EleVT.getSizeInBits()) {1108R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),1109N->getOperand(0), N->getOperand(1), N->getOperand(2),1110CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));1111} else {1112R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),1113N->getOperand(0), N->getOperand(1),1114N->getOperand(2));1115}1116--I;1117CurDAG->ReplaceAllUsesWith(N, R.getNode());1118++I;1119MadeChange = true;1120continue;1121}1122case ISD::FP_ROUND:1123case ISD::STRICT_FP_ROUND:1124case ISD::FP_TO_SINT:1125case ISD::FP_TO_UINT:1126case ISD::STRICT_FP_TO_SINT:1127case ISD::STRICT_FP_TO_UINT: {1128// Replace vector fp_to_s/uint with their X86 specific equivalent so we1129// don't need 2 sets of patterns.1130if (!N->getSimpleValueType(0).isVector())1131break;11321133unsigned NewOpc;1134switch (N->getOpcode()) {1135default: llvm_unreachable("Unexpected opcode!");1136case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;1137case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;1138case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;1139case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;1140case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;1141case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;1142}1143SDValue Res;1144if (N->isStrictFPOpcode())1145Res =1146CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},1147{N->getOperand(0), N->getOperand(1)});1148else1149Res =1150CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),1151N->getOperand(0));1152--I;1153CurDAG->ReplaceAllUsesWith(N, Res.getNode());1154++I;1155MadeChange = true;1156continue;1157}1158case ISD::SHL:1159case ISD::SRA:1160case ISD::SRL: {1161// Replace vector shifts with their X86 specific equivalent so we don't1162// need 2 sets of patterns.1163if (!N->getValueType(0).isVector())1164break;11651166unsigned NewOpc;1167switch (N->getOpcode()) {1168default: llvm_unreachable("Unexpected opcode!");1169case ISD::SHL: NewOpc = X86ISD::VSHLV; break;1170case ISD::SRA: NewOpc = X86ISD::VSRAV; break;1171case ISD::SRL: NewOpc = X86ISD::VSRLV; break;1172}1173SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),1174N->getOperand(0), N->getOperand(1));1175--I;1176CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);1177++I;1178MadeChange = true;1179continue;1180}1181case ISD::ANY_EXTEND:1182case ISD::ANY_EXTEND_VECTOR_INREG: {1183// Replace vector any extend with the zero extend equivalents so we don't1184// need 2 sets of patterns. Ignore vXi1 extensions.1185if (!N->getValueType(0).isVector())1186break;11871188unsigned NewOpc;1189if (N->getOperand(0).getScalarValueSizeInBits() == 1) {1190assert(N->getOpcode() == ISD::ANY_EXTEND &&1191"Unexpected opcode for mask vector!");1192NewOpc = ISD::SIGN_EXTEND;1193} else {1194NewOpc = N->getOpcode() == ISD::ANY_EXTEND1195? 
ISD::ZERO_EXTEND1196: ISD::ZERO_EXTEND_VECTOR_INREG;1197}11981199SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),1200N->getOperand(0));1201--I;1202CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);1203++I;1204MadeChange = true;1205continue;1206}1207case ISD::FCEIL:1208case ISD::STRICT_FCEIL:1209case ISD::FFLOOR:1210case ISD::STRICT_FFLOOR:1211case ISD::FTRUNC:1212case ISD::STRICT_FTRUNC:1213case ISD::FROUNDEVEN:1214case ISD::STRICT_FROUNDEVEN:1215case ISD::FNEARBYINT:1216case ISD::STRICT_FNEARBYINT:1217case ISD::FRINT:1218case ISD::STRICT_FRINT: {1219// Replace fp rounding with their X86 specific equivalent so we don't1220// need 2 sets of patterns.1221unsigned Imm;1222switch (N->getOpcode()) {1223default: llvm_unreachable("Unexpected opcode!");1224case ISD::STRICT_FCEIL:1225case ISD::FCEIL: Imm = 0xA; break;1226case ISD::STRICT_FFLOOR:1227case ISD::FFLOOR: Imm = 0x9; break;1228case ISD::STRICT_FTRUNC:1229case ISD::FTRUNC: Imm = 0xB; break;1230case ISD::STRICT_FROUNDEVEN:1231case ISD::FROUNDEVEN: Imm = 0x8; break;1232case ISD::STRICT_FNEARBYINT:1233case ISD::FNEARBYINT: Imm = 0xC; break;1234case ISD::STRICT_FRINT:1235case ISD::FRINT: Imm = 0x4; break;1236}1237SDLoc dl(N);1238bool IsStrict = N->isStrictFPOpcode();1239SDValue Res;1240if (IsStrict)1241Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,1242{N->getValueType(0), MVT::Other},1243{N->getOperand(0), N->getOperand(1),1244CurDAG->getTargetConstant(Imm, dl, MVT::i32)});1245else1246Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),1247N->getOperand(0),1248CurDAG->getTargetConstant(Imm, dl, MVT::i32));1249--I;1250CurDAG->ReplaceAllUsesWith(N, Res.getNode());1251++I;1252MadeChange = true;1253continue;1254}1255case X86ISD::FANDN:1256case X86ISD::FAND:1257case X86ISD::FOR:1258case X86ISD::FXOR: {1259// Widen scalar fp logic ops to vector to reduce isel patterns.1260// FIXME: Can we do this during lowering/combine.1261MVT VT = N->getSimpleValueType(0);1262if (VT.isVector() || VT == MVT::f128)1263break;12641265MVT VecVT = VT == MVT::f64 ? MVT::v2f641266: VT == MVT::f32 ? 
MVT::v4f321267: MVT::v8f16;12681269SDLoc dl(N);1270SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,1271N->getOperand(0));1272SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,1273N->getOperand(1));12741275SDValue Res;1276if (Subtarget->hasSSE2()) {1277EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();1278Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);1279Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);1280unsigned Opc;1281switch (N->getOpcode()) {1282default: llvm_unreachable("Unexpected opcode!");1283case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;1284case X86ISD::FAND: Opc = ISD::AND; break;1285case X86ISD::FOR: Opc = ISD::OR; break;1286case X86ISD::FXOR: Opc = ISD::XOR; break;1287}1288Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);1289Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);1290} else {1291Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);1292}1293Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,1294CurDAG->getIntPtrConstant(0, dl));1295--I;1296CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);1297++I;1298MadeChange = true;1299continue;1300}1301}13021303if (OptLevel != CodeGenOptLevel::None &&1304// Only do this when the target can fold the load into the call or1305// jmp.1306!Subtarget->useIndirectThunkCalls() &&1307((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||1308(N->getOpcode() == X86ISD::TC_RETURN &&1309(Subtarget->is64Bit() ||1310!getTargetMachine().isPositionIndependent())))) {1311/// Also try moving call address load from outside callseq_start to just1312/// before the call to allow it to be folded.1313///1314/// [Load chain]1315/// ^1316/// |1317/// [Load]1318/// ^ ^1319/// | |1320/// / \--1321/// / |1322///[CALLSEQ_START] |1323/// ^ |1324/// | |1325/// [LOAD/C2Reg] |1326/// | |1327/// \ /1328/// \ /1329/// [CALL]1330bool HasCallSeq = N->getOpcode() == X86ISD::CALL;1331SDValue Chain = N->getOperand(0);1332SDValue Load = N->getOperand(1);1333if (!isCalleeLoad(Load, Chain, HasCallSeq))1334continue;1335moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);1336++NumLoadMoved;1337MadeChange = true;1338continue;1339}13401341// Lower fpround and fpextend nodes that target the FP stack to be store and1342// load to the stack. This is a gross hack. We would like to simply mark1343// these as being illegal, but when we do that, legalize produces these when1344// it expands calls, then expands these in the same legalize pass. We would1345// like dag combine to be able to hack on these between the call expansion1346// and the node legalization. 
As such this pass basically does "really1347// late" legalization of these inline with the X86 isel pass.1348// FIXME: This should only happen when not compiled with -O0.1349switch (N->getOpcode()) {1350default: continue;1351case ISD::FP_ROUND:1352case ISD::FP_EXTEND:1353{1354MVT SrcVT = N->getOperand(0).getSimpleValueType();1355MVT DstVT = N->getSimpleValueType(0);13561357// If any of the sources are vectors, no fp stack involved.1358if (SrcVT.isVector() || DstVT.isVector())1359continue;13601361// If the source and destination are SSE registers, then this is a legal1362// conversion that should not be lowered.1363const X86TargetLowering *X86Lowering =1364static_cast<const X86TargetLowering *>(TLI);1365bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);1366bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);1367if (SrcIsSSE && DstIsSSE)1368continue;13691370if (!SrcIsSSE && !DstIsSSE) {1371// If this is an FPStack extension, it is a noop.1372if (N->getOpcode() == ISD::FP_EXTEND)1373continue;1374// If this is a value-preserving FPStack truncation, it is a noop.1375if (N->getConstantOperandVal(1))1376continue;1377}13781379// Here we could have an FP stack truncation or an FPStack <-> SSE convert.1380// FPStack has extload and truncstore. SSE can fold direct loads into other1381// operations. Based on this, decide what we want to do.1382MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;1383SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);1384int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();1385MachinePointerInfo MPI =1386MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);1387SDLoc dl(N);13881389// FIXME: optimize the case where the src/dest is a load or store?13901391SDValue Store = CurDAG->getTruncStore(1392CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);1393SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,1394MemTmp, MPI, MemVT);13951396// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the1397// extload we created. 
This will cause general havok on the dag because1398// anything below the conversion could be folded into other existing nodes.1399// To avoid invalidating 'I', back it up to the convert node.1400--I;1401CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);1402break;1403}14041405//The sequence of events for lowering STRICT_FP versions of these nodes requires1406//dealing with the chain differently, as there is already a preexisting chain.1407case ISD::STRICT_FP_ROUND:1408case ISD::STRICT_FP_EXTEND:1409{1410MVT SrcVT = N->getOperand(1).getSimpleValueType();1411MVT DstVT = N->getSimpleValueType(0);14121413// If any of the sources are vectors, no fp stack involved.1414if (SrcVT.isVector() || DstVT.isVector())1415continue;14161417// If the source and destination are SSE registers, then this is a legal1418// conversion that should not be lowered.1419const X86TargetLowering *X86Lowering =1420static_cast<const X86TargetLowering *>(TLI);1421bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);1422bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);1423if (SrcIsSSE && DstIsSSE)1424continue;14251426if (!SrcIsSSE && !DstIsSSE) {1427// If this is an FPStack extension, it is a noop.1428if (N->getOpcode() == ISD::STRICT_FP_EXTEND)1429continue;1430// If this is a value-preserving FPStack truncation, it is a noop.1431if (N->getConstantOperandVal(2))1432continue;1433}14341435// Here we could have an FP stack truncation or an FPStack <-> SSE convert.1436// FPStack has extload and truncstore. SSE can fold direct loads into other1437// operations. Based on this, decide what we want to do.1438MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;1439SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);1440int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();1441MachinePointerInfo MPI =1442MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);1443SDLoc dl(N);14441445// FIXME: optimize the case where the src/dest is a load or store?14461447//Since the operation is StrictFP, use the preexisting chain.1448SDValue Store, Result;1449if (!SrcIsSSE) {1450SDVTList VTs = CurDAG->getVTList(MVT::Other);1451SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};1452Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,1453MPI, /*Align*/ std::nullopt,1454MachineMemOperand::MOStore);1455if (N->getFlags().hasNoFPExcept()) {1456SDNodeFlags Flags = Store->getFlags();1457Flags.setNoFPExcept(true);1458Store->setFlags(Flags);1459}1460} else {1461assert(SrcVT == MemVT && "Unexpected VT!");1462Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,1463MPI);1464}14651466if (!DstIsSSE) {1467SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);1468SDValue Ops[] = {Store, MemTmp};1469Result = CurDAG->getMemIntrinsicNode(1470X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,1471/*Align*/ std::nullopt, MachineMemOperand::MOLoad);1472if (N->getFlags().hasNoFPExcept()) {1473SDNodeFlags Flags = Result->getFlags();1474Flags.setNoFPExcept(true);1475Result->setFlags(Flags);1476}1477} else {1478assert(DstVT == MemVT && "Unexpected VT!");1479Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);1480}14811482// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the1483// extload we created. 
This will cause general havok on the dag because1484// anything below the conversion could be folded into other existing nodes.1485// To avoid invalidating 'I', back it up to the convert node.1486--I;1487CurDAG->ReplaceAllUsesWith(N, Result.getNode());1488break;1489}1490}149114921493// Now that we did that, the node is dead. Increment the iterator to the1494// next node to process, then delete N.1495++I;1496MadeChange = true;1497}14981499// Remove any dead nodes that may have been left behind.1500if (MadeChange)1501CurDAG->RemoveDeadNodes();1502}15031504// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.1505bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {1506unsigned Opc = N->getMachineOpcode();1507if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&1508Opc != X86::MOVSX64rr8)1509return false;15101511SDValue N0 = N->getOperand(0);15121513// We need to be extracting the lower bit of an extend.1514if (!N0.isMachineOpcode() ||1515N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||1516N0.getConstantOperandVal(1) != X86::sub_8bit)1517return false;15181519// We're looking for either a movsx or movzx to match the original opcode.1520unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX1521: X86::MOVSX32rr8_NOREX;1522SDValue N00 = N0.getOperand(0);1523if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)1524return false;15251526if (Opc == X86::MOVSX64rr8) {1527// If we had a sign extend from 8 to 64 bits. We still need to go from 321528// to 64.1529MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),1530MVT::i64, N00);1531ReplaceUses(N, Extend);1532} else {1533// Ok we can drop this extend and just use the original extend.1534ReplaceUses(N, N00.getNode());1535}15361537return true;1538}15391540void X86DAGToDAGISel::PostprocessISelDAG() {1541// Skip peepholes at -O0.1542if (TM.getOptLevel() == CodeGenOptLevel::None)1543return;15441545SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();15461547bool MadeChange = false;1548while (Position != CurDAG->allnodes_begin()) {1549SDNode *N = &*--Position;1550// Skip dead nodes and any non-machine opcodes.1551if (N->use_empty() || !N->isMachineOpcode())1552continue;15531554if (tryOptimizeRem8Extend(N)) {1555MadeChange = true;1556continue;1557}15581559unsigned Opc = N->getMachineOpcode();1560switch (Opc) {1561default:1562continue;1563// ANDrr/rm + TESTrr+ -> TESTrr/TESTmr1564case X86::TEST8rr:1565case X86::TEST16rr:1566case X86::TEST32rr:1567case X86::TEST64rr:1568// ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr1569case X86::CTEST8rr:1570case X86::CTEST16rr:1571case X86::CTEST32rr:1572case X86::CTEST64rr: {1573auto &Op0 = N->getOperand(0);1574if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||1575!Op0.isMachineOpcode())1576continue;1577SDValue And = N->getOperand(0);1578#define CASE_ND(OP) \1579case X86::OP: \1580case X86::OP##_ND:1581switch (And.getMachineOpcode()) {1582default:1583continue;1584CASE_ND(AND8rr)1585CASE_ND(AND16rr)1586CASE_ND(AND32rr)1587CASE_ND(AND64rr) {1588if (And->hasAnyUseOfValue(1))1589continue;1590SmallVector<SDValue> Ops(N->op_values());1591Ops[0] = And.getOperand(0);1592Ops[1] = And.getOperand(1);1593MachineSDNode *Test =1594CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);1595ReplaceUses(N, Test);1596MadeChange = true;1597continue;1598}1599CASE_ND(AND8rm)1600CASE_ND(AND16rm)1601CASE_ND(AND32rm)1602CASE_ND(AND64rm) {1603if (And->hasAnyUseOfValue(1))1604continue;1605unsigned NewOpc;1606bool IsCTESTCC = 
X86::isCTESTCC(Opc);1607#define FROM_TO(A, B) \1608CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \1609break;1610switch (And.getMachineOpcode()) {1611FROM_TO(AND8rm, TEST8mr);1612FROM_TO(AND16rm, TEST16mr);1613FROM_TO(AND32rm, TEST32mr);1614FROM_TO(AND64rm, TEST64mr);1615}1616#undef FROM_TO1617#undef CASE_ND1618// Need to swap the memory and register operand.1619SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),1620And.getOperand(3), And.getOperand(4),1621And.getOperand(5), And.getOperand(0)};1622// CC, Cflags.1623if (IsCTESTCC) {1624Ops.push_back(N->getOperand(2));1625Ops.push_back(N->getOperand(3));1626}1627// Chain of memory load1628Ops.push_back(And.getOperand(6));1629// Glue1630if (IsCTESTCC)1631Ops.push_back(N->getOperand(4));16321633MachineSDNode *Test = CurDAG->getMachineNode(1634NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);1635CurDAG->setNodeMemRefs(1636Test, cast<MachineSDNode>(And.getNode())->memoperands());1637ReplaceUses(And.getValue(2), SDValue(Test, 1));1638ReplaceUses(SDValue(N, 0), SDValue(Test, 0));1639MadeChange = true;1640continue;1641}1642}1643}1644// Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is1645// used. We're doing this late so we can prefer to fold the AND into masked1646// comparisons. Doing that can be better for the live range of the mask1647// register.1648case X86::KORTESTBrr:1649case X86::KORTESTWrr:1650case X86::KORTESTDrr:1651case X86::KORTESTQrr: {1652SDValue Op0 = N->getOperand(0);1653if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||1654!Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))1655continue;1656#define CASE(A) \1657case X86::A: \1658break;1659switch (Op0.getMachineOpcode()) {1660default:1661continue;1662CASE(KANDBrr)1663CASE(KANDWrr)1664CASE(KANDDrr)1665CASE(KANDQrr)1666}1667unsigned NewOpc;1668#define FROM_TO(A, B) \1669case X86::A: \1670NewOpc = X86::B; \1671break;1672switch (Opc) {1673FROM_TO(KORTESTBrr, KTESTBrr)1674FROM_TO(KORTESTWrr, KTESTWrr)1675FROM_TO(KORTESTDrr, KTESTDrr)1676FROM_TO(KORTESTQrr, KTESTQrr)1677}1678// KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. 
The other1679// KAND instructions and KTEST use the same ISA feature.1680if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI())1681continue;1682#undef FROM_TO1683MachineSDNode *KTest = CurDAG->getMachineNode(1684NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));1685ReplaceUses(N, KTest);1686MadeChange = true;1687continue;1688}1689// Attempt to remove vectors moves that were inserted to zero upper bits.1690case TargetOpcode::SUBREG_TO_REG: {1691unsigned SubRegIdx = N->getConstantOperandVal(2);1692if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)1693continue;16941695SDValue Move = N->getOperand(1);1696if (!Move.isMachineOpcode())1697continue;16981699// Make sure its one of the move opcodes we recognize.1700switch (Move.getMachineOpcode()) {1701default:1702continue;1703CASE(VMOVAPDrr) CASE(VMOVUPDrr)1704CASE(VMOVAPSrr) CASE(VMOVUPSrr)1705CASE(VMOVDQArr) CASE(VMOVDQUrr)1706CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)1707CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)1708CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)1709CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)1710CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)1711CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)1712CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)1713CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)1714CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)1715CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)1716CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)1717}1718#undef CASE17191720SDValue In = Move.getOperand(0);1721if (!In.isMachineOpcode() ||1722In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)1723continue;17241725// Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers1726// the SHA instructions which use a legacy encoding.1727uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;1728if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&1729(TSFlags & X86II::EncodingMask) != X86II::EVEX &&1730(TSFlags & X86II::EncodingMask) != X86II::XOP)1731continue;17321733// Producing instruction is another vector instruction. We can drop the1734// move.1735CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));1736MadeChange = true;1737}1738}1739}17401741if (MadeChange)1742CurDAG->RemoveDeadNodes();1743}174417451746/// Emit any code that needs to be executed only in the main function.1747void X86DAGToDAGISel::emitSpecialCodeForMain() {1748if (Subtarget->isTargetCygMing()) {1749TargetLowering::ArgListTy Args;1750auto &DL = CurDAG->getDataLayout();17511752TargetLowering::CallLoweringInfo CLI(*CurDAG);1753CLI.setChain(CurDAG->getRoot())1754.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),1755CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),1756std::move(Args));1757const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();1758std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);1759CurDAG->setRoot(Result.second);1760}1761}17621763void X86DAGToDAGISel::emitFunctionEntryCode() {1764// If this is main, emit special code for main.1765const Function &F = MF->getFunction();1766if (F.hasExternalLinkage() && F.getName() == "main")1767emitSpecialCodeForMain();1768}17691770static bool isDispSafeForFrameIndex(int64_t Val) {1771// On 64-bit platforms, we can run into an issue where a frame index1772// includes a displacement that, when added to the explicit displacement,1773// will overflow the displacement field. 
Assuming that the frame index1774// displacement fits into a 31-bit integer (which is only slightly more1775// aggressive than the current fundamental assumption that it fits into1776// a 32-bit integer), a 31-bit disp should always be safe.1777return isInt<31>(Val);1778}17791780bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,1781X86ISelAddressMode &AM) {1782// We may have already matched a displacement and the caller just added the1783// symbolic displacement. So we still need to do the checks even if Offset1784// is zero.17851786int64_t Val = AM.Disp + Offset;17871788// Cannot combine ExternalSymbol displacements with integer offsets.1789if (Val != 0 && (AM.ES || AM.MCSym))1790return true;17911792CodeModel::Model M = TM.getCodeModel();1793if (Subtarget->is64Bit()) {1794if (Val != 0 &&1795!X86::isOffsetSuitableForCodeModel(Val, M,1796AM.hasSymbolicDisplacement()))1797return true;1798// In addition to the checks required for a register base, check that1799// we do not try to use an unsafe Disp with a frame index.1800if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&1801!isDispSafeForFrameIndex(Val))1802return true;1803// In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to1804// 64 bits. Instructions with 32-bit register addresses perform this zero1805// extension for us and we can safely ignore the high bits of Offset.1806// Instructions with only a 32-bit immediate address do not, though: they1807// sign extend instead. This means only address the low 2GB of address space1808// is directly addressable, we need indirect addressing for the high 2GB of1809// address space.1810// TODO: Some of the earlier checks may be relaxed for ILP32 mode as the1811// implicit zero extension of instructions would cover up any problem.1812// However, we have asserts elsewhere that get triggered if we do, so keep1813// the checks for now.1814// TODO: We would actually be able to accept these, as well as the same1815// addresses in LP64 mode, by adding the EIZ pseudo-register as an operand1816// to get an address size override to be emitted. However, this1817// pseudo-register is not part of any register class and therefore causes1818// MIR verification to fail.1819if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&1820!AM.hasBaseOrIndexReg())1821return true;1822}1823AM.Disp = Val;1824return false;1825}18261827bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,1828bool AllowSegmentRegForX32) {1829SDValue Address = N->getOperand(1);18301831// load gs:0 -> GS segment register.1832// load fs:0 -> FS segment register.1833//1834// This optimization is generally valid because the GNU TLS model defines that1835// gs:0 (or fs:0 on X86-64) contains its own address. 
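  // (Concretely, a load of address 0 in the FS or GS address space can then be
  //  selected as a bare segment-relative access such as "movq %fs:0, %rax",
  //  with no base or index register.)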
  // However, for X86-64 mode with 32-bit registers, as we get in ILP32 mode,
  // those registers are first zero-extended to 64 bits and then added to the
  // base address, which gives unwanted results when the register holds a
  // negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
       Subtarget->isTargetFuchsia())) {
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      return false;
      // Address space X86AS::SS is not handled here, because it is not used to
      // address TLS areas.
    }
  }

  return true;
}

/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception.
  // In the medium code model, we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(AM.GV)) {
    AM = Backup;
    return true;
  }

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}

/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register.
This is only performed for1947// 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded1948// any foldable load the first time.1949if (Subtarget->isTarget64BitILP32() &&1950AM.BaseType == X86ISelAddressMode::RegBase &&1951AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {1952SDValue Save_Base_Reg = AM.Base_Reg;1953if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {1954AM.Base_Reg = SDValue();1955if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))1956AM.Base_Reg = Save_Base_Reg;1957}1958}19591960// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has1961// a smaller encoding and avoids a scaled-index.1962if (AM.Scale == 2 &&1963AM.BaseType == X86ISelAddressMode::RegBase &&1964AM.Base_Reg.getNode() == nullptr) {1965AM.Base_Reg = AM.IndexReg;1966AM.Scale = 1;1967}19681969// Post-processing: Convert foo to foo(%rip), even in non-PIC mode,1970// because it has a smaller encoding.1971if (TM.getCodeModel() != CodeModel::Large &&1972(!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&1973AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&1974AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&1975AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {1976AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);1977}19781979return false;1980}19811982bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,1983unsigned Depth) {1984// Add an artificial use to this node so that we can keep track of1985// it if it gets CSE'd with a different node.1986HandleSDNode Handle(N);19871988X86ISelAddressMode Backup = AM;1989if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&1990!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))1991return false;1992AM = Backup;19931994// Try again after commutating the operands.1995if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,1996Depth + 1) &&1997!matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))1998return false;1999AM = Backup;20002001// If we couldn't fold both operands into the address at the same time,2002// see if we can just put each operand into a register and fold at least2003// the add.2004if (AM.BaseType == X86ISelAddressMode::RegBase &&2005!AM.Base_Reg.getNode() &&2006!AM.IndexReg.getNode()) {2007N = Handle.getValue();2008AM.Base_Reg = N.getOperand(0);2009AM.IndexReg = N.getOperand(1);2010AM.Scale = 1;2011return false;2012}2013N = Handle.getValue();2014return true;2015}20162017// Insert a node into the DAG at least before the Pos node's position. This2018// will reposition the node as needed, and will assign it a node ID that is <=2019// the Pos node's ID. Note that this does *not* preserve the uniqueness of node2020// IDs! 
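// (In particular, after repeated insertions several repositioned nodes may
//  end up carrying an invalidated copy of the ID of the node they were
//  inserted before.)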
The selection DAG must no longer depend on their uniqueness when this2021// is used.2022static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {2023if (N->getNodeId() == -1 ||2024(SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >2025SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {2026DAG.RepositionNode(Pos->getIterator(), N.getNode());2027// Mark Node as invalid for pruning as after this it may be a successor to a2028// selected node but otherwise be in the same position of Pos.2029// Conservatively mark it with the same -abs(Id) to assure node id2030// invariant is preserved.2031N->setNodeId(Pos->getNodeId());2032SelectionDAGISel::InvalidateNodeId(N.getNode());2033}2034}20352036// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if2037// safe. This allows us to convert the shift and and into an h-register2038// extract and a scaled index. Returns false if the simplification is2039// performed.2040static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,2041uint64_t Mask,2042SDValue Shift, SDValue X,2043X86ISelAddressMode &AM) {2044if (Shift.getOpcode() != ISD::SRL ||2045!isa<ConstantSDNode>(Shift.getOperand(1)) ||2046!Shift.hasOneUse())2047return true;20482049int ScaleLog = 8 - Shift.getConstantOperandVal(1);2050if (ScaleLog <= 0 || ScaleLog >= 4 ||2051Mask != (0xffu << ScaleLog))2052return true;20532054MVT XVT = X.getSimpleValueType();2055MVT VT = N.getSimpleValueType();2056SDLoc DL(N);2057SDValue Eight = DAG.getConstant(8, DL, MVT::i8);2058SDValue NewMask = DAG.getConstant(0xff, DL, XVT);2059SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);2060SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);2061SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);2062SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);2063SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);20642065// Insert the new nodes into the topological ordering. We must do this in2066// a valid topological ordering as nothing is going to go back and re-sort2067// these nodes. We continually insert before 'N' in sequence as this is2068// essentially a pre-flattened and pre-sorted sequence of nodes. There is no2069// hierarchy left to express.2070insertDAGNode(DAG, N, Eight);2071insertDAGNode(DAG, N, NewMask);2072insertDAGNode(DAG, N, Srl);2073insertDAGNode(DAG, N, And);2074insertDAGNode(DAG, N, Ext);2075insertDAGNode(DAG, N, ShlCount);2076insertDAGNode(DAG, N, Shl);2077DAG.ReplaceAllUsesWith(N, Shl);2078DAG.RemoveDeadNode(N.getNode());2079AM.IndexReg = Ext;2080AM.Scale = (1 << ScaleLog);2081return false;2082}20832084// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this2085// allows us to fold the shift into this addressing mode. Returns false if the2086// transform succeeded.2087static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,2088X86ISelAddressMode &AM) {2089SDValue Shift = N.getOperand(0);20902091// Use a signed mask so that shifting right will insert sign bits. These2092// bits will be removed when we shift the result left so it doesn't matter2093// what we use. This might allow a smaller immediate encoding.2094int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();20952096// If we have an any_extend feeding the AND, look through it to see if there2097// is a shift behind it. 
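  // (Illustration of the overall fold done by this function: (and (shl X, 3),
  //  0x1F8) can be rewritten as (shl (and X, 0x3F), 3), after which the shl
  //  folds into Scale == 8.)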
But only if the AND doesn't use the extended bits.2098// FIXME: Generalize this to other ANY_EXTEND than i32 to i64?2099bool FoundAnyExtend = false;2100if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&2101Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&2102isUInt<32>(Mask)) {2103FoundAnyExtend = true;2104Shift = Shift.getOperand(0);2105}21062107if (Shift.getOpcode() != ISD::SHL ||2108!isa<ConstantSDNode>(Shift.getOperand(1)))2109return true;21102111SDValue X = Shift.getOperand(0);21122113// Not likely to be profitable if either the AND or SHIFT node has more2114// than one use (unless all uses are for address computation). Besides,2115// isel mechanism requires their node ids to be reused.2116if (!N.hasOneUse() || !Shift.hasOneUse())2117return true;21182119// Verify that the shift amount is something we can fold.2120unsigned ShiftAmt = Shift.getConstantOperandVal(1);2121if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)2122return true;21232124MVT VT = N.getSimpleValueType();2125SDLoc DL(N);2126if (FoundAnyExtend) {2127SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);2128insertDAGNode(DAG, N, NewX);2129X = NewX;2130}21312132SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);2133SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);2134SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));21352136// Insert the new nodes into the topological ordering. We must do this in2137// a valid topological ordering as nothing is going to go back and re-sort2138// these nodes. We continually insert before 'N' in sequence as this is2139// essentially a pre-flattened and pre-sorted sequence of nodes. There is no2140// hierarchy left to express.2141insertDAGNode(DAG, N, NewMask);2142insertDAGNode(DAG, N, NewAnd);2143insertDAGNode(DAG, N, NewShift);2144DAG.ReplaceAllUsesWith(N, NewShift);2145DAG.RemoveDeadNode(N.getNode());21462147AM.Scale = 1 << ShiftAmt;2148AM.IndexReg = NewAnd;2149return false;2150}21512152// Implement some heroics to detect shifts of masked values where the mask can2153// be replaced by extending the shift and undoing that in the addressing mode2154// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and2155// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in2156// the addressing mode. This results in code such as:2157//2158// int f(short *y, int *lookup_table) {2159// ...2160// return *y + lookup_table[*y >> 11];2161// }2162//2163// Turning into:2164// movzwl (%rdi), %eax2165// movl %eax, %ecx2166// shrl $11, %ecx2167// addl (%rsi,%rcx,4), %eax2168//2169// Instead of:2170// movzwl (%rdi), %eax2171// movl %eax, %ecx2172// shrl $9, %ecx2173// andl $124, %rcx2174// addl (%rsi,%rcx), %eax2175//2176// Note that this function assumes the mask is provided as a mask *after* the2177// value is shifted. 
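// (E.g. for (srl (and X, 0xF0), 2) the AND mask applies before the shift, so
//  the caller passes the equivalent post-shift mask 0xF0 >> 2 == 0x3C here.)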
The input chain may or may not match that, but computing2178// such a mask is trivial.2179static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,2180uint64_t Mask,2181SDValue Shift, SDValue X,2182X86ISelAddressMode &AM) {2183if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||2184!isa<ConstantSDNode>(Shift.getOperand(1)))2185return true;21862187// We need to ensure that mask is a continuous run of bits.2188unsigned MaskIdx, MaskLen;2189if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))2190return true;2191unsigned MaskLZ = 64 - (MaskIdx + MaskLen);21922193unsigned ShiftAmt = Shift.getConstantOperandVal(1);21942195// The amount of shift we're trying to fit into the addressing mode is taken2196// from the shifted mask index (number of trailing zeros of the mask).2197unsigned AMShiftAmt = MaskIdx;21982199// There is nothing we can do here unless the mask is removing some bits.2200// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.2201if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;22022203// Scale the leading zero count down based on the actual size of the value.2204// Also scale it down based on the size of the shift.2205unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;2206if (MaskLZ < ScaleDown)2207return true;2208MaskLZ -= ScaleDown;22092210// The final check is to ensure that any masked out high bits of X are2211// already known to be zero. Otherwise, the mask has a semantic impact2212// other than masking out a couple of low bits. Unfortunately, because of2213// the mask, zero extensions will be removed from operands in some cases.2214// This code works extra hard to look through extensions because we can2215// replace them with zero extensions cheaply if necessary.2216bool ReplacingAnyExtend = false;2217if (X.getOpcode() == ISD::ANY_EXTEND) {2218unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -2219X.getOperand(0).getSimpleValueType().getSizeInBits();2220// Assume that we'll replace the any-extend with a zero-extend, and2221// narrow the search to the extended value.2222X = X.getOperand(0);2223MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;2224ReplacingAnyExtend = true;2225}2226APInt MaskedHighBits =2227APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);2228if (!DAG.MaskedValueIsZero(X, MaskedHighBits))2229return true;22302231// We've identified a pattern that can be transformed into a single shift2232// and an addressing mode. Make it so.2233MVT VT = N.getSimpleValueType();2234if (ReplacingAnyExtend) {2235assert(X.getValueType() != VT);2236// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.2237SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);2238insertDAGNode(DAG, N, NewX);2239X = NewX;2240}22412242MVT XVT = X.getSimpleValueType();2243SDLoc DL(N);2244SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);2245SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);2246SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);2247SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);2248SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);22492250// Insert the new nodes into the topological ordering. We must do this in2251// a valid topological ordering as nothing is going to go back and re-sort2252// these nodes. We continually insert before 'N' in sequence as this is2253// essentially a pre-flattened and pre-sorted sequence of nodes. 
There is no2254// hierarchy left to express.2255insertDAGNode(DAG, N, NewSRLAmt);2256insertDAGNode(DAG, N, NewSRL);2257insertDAGNode(DAG, N, NewExt);2258insertDAGNode(DAG, N, NewSHLAmt);2259insertDAGNode(DAG, N, NewSHL);2260DAG.ReplaceAllUsesWith(N, NewSHL);2261DAG.RemoveDeadNode(N.getNode());22622263AM.Scale = 1 << AMShiftAmt;2264AM.IndexReg = NewExt;2265return false;2266}22672268// Transform "(X >> SHIFT) & (MASK << C1)" to2269// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be2270// matched to a BEXTR later. Returns false if the simplification is performed.2271static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,2272uint64_t Mask,2273SDValue Shift, SDValue X,2274X86ISelAddressMode &AM,2275const X86Subtarget &Subtarget) {2276if (Shift.getOpcode() != ISD::SRL ||2277!isa<ConstantSDNode>(Shift.getOperand(1)) ||2278!Shift.hasOneUse() || !N.hasOneUse())2279return true;22802281// Only do this if BEXTR will be matched by matchBEXTRFromAndImm.2282if (!Subtarget.hasTBM() &&2283!(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))2284return true;22852286// We need to ensure that mask is a continuous run of bits.2287unsigned MaskIdx, MaskLen;2288if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))2289return true;22902291unsigned ShiftAmt = Shift.getConstantOperandVal(1);22922293// The amount of shift we're trying to fit into the addressing mode is taken2294// from the shifted mask index (number of trailing zeros of the mask).2295unsigned AMShiftAmt = MaskIdx;22962297// There is nothing we can do here unless the mask is removing some bits.2298// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.2299if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;23002301MVT XVT = X.getSimpleValueType();2302MVT VT = N.getSimpleValueType();2303SDLoc DL(N);2304SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);2305SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);2306SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);2307SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);2308SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);2309SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);2310SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);23112312// Insert the new nodes into the topological ordering. We must do this in2313// a valid topological ordering as nothing is going to go back and re-sort2314// these nodes. We continually insert before 'N' in sequence as this is2315// essentially a pre-flattened and pre-sorted sequence of nodes. There is no2316// hierarchy left to express.2317insertDAGNode(DAG, N, NewSRLAmt);2318insertDAGNode(DAG, N, NewSRL);2319insertDAGNode(DAG, N, NewMask);2320insertDAGNode(DAG, N, NewAnd);2321insertDAGNode(DAG, N, NewExt);2322insertDAGNode(DAG, N, NewSHLAmt);2323insertDAGNode(DAG, N, NewSHL);2324DAG.ReplaceAllUsesWith(N, NewSHL);2325DAG.RemoveDeadNode(N.getNode());23262327AM.Scale = 1 << AMShiftAmt;2328AM.IndexReg = NewExt;2329return false;2330}23312332// Attempt to peek further into a scaled index register, collecting additional2333// extensions / offsets / etc. 
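// (For example, with AM.Scale == 4 an index of (add X, 5) can be peeled to an
//  index of X with 4 * 5 == 20 folded into the displacement.)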
Returns /p N if we can't peek any further.2334SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,2335X86ISelAddressMode &AM,2336unsigned Depth) {2337assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");2338assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&2339"Illegal index scale");23402341// Limit recursion.2342if (Depth >= SelectionDAG::MaxRecursionDepth)2343return N;23442345EVT VT = N.getValueType();2346unsigned Opc = N.getOpcode();23472348// index: add(x,c) -> index: x, disp + c2349if (CurDAG->isBaseWithConstantOffset(N)) {2350auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));2351uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;2352if (!foldOffsetIntoAddress(Offset, AM))2353return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);2354}23552356// index: add(x,x) -> index: x, scale * 22357if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {2358if (AM.Scale <= 4) {2359AM.Scale *= 2;2360return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);2361}2362}23632364// index: shl(x,i) -> index: x, scale * (1 << i)2365if (Opc == X86ISD::VSHLI) {2366uint64_t ShiftAmt = N.getConstantOperandVal(1);2367uint64_t ScaleAmt = 1ULL << ShiftAmt;2368if ((AM.Scale * ScaleAmt) <= 8) {2369AM.Scale *= ScaleAmt;2370return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);2371}2372}23732374// index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)2375// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?2376if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {2377SDValue Src = N.getOperand(0);2378if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&2379Src.hasOneUse()) {2380if (CurDAG->isBaseWithConstantOffset(Src)) {2381SDValue AddSrc = Src.getOperand(0);2382auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));2383uint64_t Offset = (uint64_t)AddVal->getSExtValue();2384if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {2385SDLoc DL(N);2386SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);2387SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);2388SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);2389insertDAGNode(*CurDAG, N, ExtSrc);2390insertDAGNode(*CurDAG, N, ExtVal);2391insertDAGNode(*CurDAG, N, ExtAdd);2392CurDAG->ReplaceAllUsesWith(N, ExtAdd);2393CurDAG->RemoveDeadNode(N.getNode());2394return ExtSrc;2395}2396}2397}2398}23992400// index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)2401// index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)2402// TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?2403if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {2404SDValue Src = N.getOperand(0);2405unsigned SrcOpc = Src.getOpcode();2406if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||2407CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&2408Src.hasOneUse()) {2409if (CurDAG->isBaseWithConstantOffset(Src)) {2410SDValue AddSrc = Src.getOperand(0);2411uint64_t Offset = Src.getConstantOperandVal(1);2412if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {2413SDLoc DL(N);2414SDValue Res;2415// If we're also scaling, see if we can use that as well.2416if (AddSrc.getOpcode() == ISD::SHL &&2417isa<ConstantSDNode>(AddSrc.getOperand(1))) {2418SDValue ShVal = AddSrc.getOperand(0);2419uint64_t ShAmt = AddSrc.getConstantOperandVal(1);2420APInt HiBits =2421APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);2422uint64_t ScaleAmt = 1ULL << ShAmt;2423if ((AM.Scale * ScaleAmt) <= 8 
&&2424(AddSrc->getFlags().hasNoUnsignedWrap() ||2425CurDAG->MaskedValueIsZero(ShVal, HiBits))) {2426AM.Scale *= ScaleAmt;2427SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);2428SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,2429AddSrc.getOperand(1));2430insertDAGNode(*CurDAG, N, ExtShVal);2431insertDAGNode(*CurDAG, N, ExtShift);2432AddSrc = ExtShift;2433Res = ExtShVal;2434}2435}2436SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);2437SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);2438SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);2439insertDAGNode(*CurDAG, N, ExtSrc);2440insertDAGNode(*CurDAG, N, ExtVal);2441insertDAGNode(*CurDAG, N, ExtAdd);2442CurDAG->ReplaceAllUsesWith(N, ExtAdd);2443CurDAG->RemoveDeadNode(N.getNode());2444return Res ? Res : ExtSrc;2445}2446}2447}2448}24492450// TODO: Handle extensions, shifted masks etc.2451return N;2452}24532454bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,2455unsigned Depth) {2456SDLoc dl(N);2457LLVM_DEBUG({2458dbgs() << "MatchAddress: ";2459AM.dump(CurDAG);2460});2461// Limit recursion.2462if (Depth >= SelectionDAG::MaxRecursionDepth)2463return matchAddressBase(N, AM);24642465// If this is already a %rip relative address, we can only merge immediates2466// into it. Instead of handling this in every case, we handle it here.2467// RIP relative addressing: %rip + 32-bit displacement!2468if (AM.isRIPRelative()) {2469// FIXME: JumpTable and ExternalSymbol address currently don't like2470// displacements. It isn't very important, but this should be fixed for2471// consistency.2472if (!(AM.ES || AM.MCSym) && AM.JT != -1)2473return true;24742475if (auto *Cst = dyn_cast<ConstantSDNode>(N))2476if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))2477return false;2478return true;2479}24802481switch (N.getOpcode()) {2482default: break;2483case ISD::LOCAL_RECOVER: {2484if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)2485if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {2486// Use the symbol and don't prefix it.2487AM.MCSym = ESNode->getMCSymbol();2488return false;2489}2490break;2491}2492case ISD::Constant: {2493uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();2494if (!foldOffsetIntoAddress(Val, AM))2495return false;2496break;2497}24982499case X86ISD::Wrapper:2500case X86ISD::WrapperRIP:2501if (!matchWrapper(N, AM))2502return false;2503break;25042505case ISD::LOAD:2506if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))2507return false;2508break;25092510case ISD::FrameIndex:2511if (AM.BaseType == X86ISelAddressMode::RegBase &&2512AM.Base_Reg.getNode() == nullptr &&2513(!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {2514AM.BaseType = X86ISelAddressMode::FrameIndexBase;2515AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();2516return false;2517}2518break;25192520case ISD::SHL:2521if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)2522break;25232524if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {2525unsigned Val = CN->getZExtValue();2526// Note that we handle x<<1 as (,x,2) rather than (x,x) here so2527// that the base operand remains free for further matching. 
If2528// the base doesn't end up getting used, a post-processing step2529// in MatchAddress turns (,x,2) into (x,x), which is cheaper.2530if (Val == 1 || Val == 2 || Val == 3) {2531SDValue ShVal = N.getOperand(0);2532AM.Scale = 1 << Val;2533AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);2534return false;2535}2536}2537break;25382539case ISD::SRL: {2540// Scale must not be used already.2541if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;25422543// We only handle up to 64-bit values here as those are what matter for2544// addressing mode optimizations.2545assert(N.getSimpleValueType().getSizeInBits() <= 64 &&2546"Unexpected value size!");25472548SDValue And = N.getOperand(0);2549if (And.getOpcode() != ISD::AND) break;2550SDValue X = And.getOperand(0);25512552// The mask used for the transform is expected to be post-shift, but we2553// found the shift first so just apply the shift to the mask before passing2554// it down.2555if (!isa<ConstantSDNode>(N.getOperand(1)) ||2556!isa<ConstantSDNode>(And.getOperand(1)))2557break;2558uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);25592560// Try to fold the mask and shift into the scale, and return false if we2561// succeed.2562if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))2563return false;2564break;2565}25662567case ISD::SMUL_LOHI:2568case ISD::UMUL_LOHI:2569// A mul_lohi where we need the low part can be folded as a plain multiply.2570if (N.getResNo() != 0) break;2571[[fallthrough]];2572case ISD::MUL:2573case X86ISD::MUL_IMM:2574// X*[3,5,9] -> X+X*[2,4,8]2575if (AM.BaseType == X86ISelAddressMode::RegBase &&2576AM.Base_Reg.getNode() == nullptr &&2577AM.IndexReg.getNode() == nullptr) {2578if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))2579if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||2580CN->getZExtValue() == 9) {2581AM.Scale = unsigned(CN->getZExtValue())-1;25822583SDValue MulVal = N.getOperand(0);2584SDValue Reg;25852586// Okay, we know that we have a scale by now. However, if the scaled2587// value is an add of something and a constant, we can fold the2588// constant into the disp field here.2589if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&2590isa<ConstantSDNode>(MulVal.getOperand(1))) {2591Reg = MulVal.getOperand(0);2592auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));2593uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();2594if (foldOffsetIntoAddress(Disp, AM))2595Reg = N.getOperand(0);2596} else {2597Reg = N.getOperand(0);2598}25992600AM.IndexReg = AM.Base_Reg = Reg;2601return false;2602}2603}2604break;26052606case ISD::SUB: {2607// Given A-B, if A can be completely folded into the address and2608// the index field with the index field unused, use -B as the index.2609// This is a win if a has multiple parts that can be folded into2610// the address. 
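    // (For instance, if A is globalvar+8, both the symbol and the constant
    //  fold into the displacement, while B is negated later and becomes the
    //  index register.)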
Also, this saves a mov if the base register has2611// other uses, since it avoids a two-address sub instruction, however2612// it costs an additional mov if the index register has other uses.26132614// Add an artificial use to this node so that we can keep track of2615// it if it gets CSE'd with a different node.2616HandleSDNode Handle(N);26172618// Test if the LHS of the sub can be folded.2619X86ISelAddressMode Backup = AM;2620if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {2621N = Handle.getValue();2622AM = Backup;2623break;2624}2625N = Handle.getValue();2626// Test if the index field is free for use.2627if (AM.IndexReg.getNode() || AM.isRIPRelative()) {2628AM = Backup;2629break;2630}26312632int Cost = 0;2633SDValue RHS = N.getOperand(1);2634// If the RHS involves a register with multiple uses, this2635// transformation incurs an extra mov, due to the neg instruction2636// clobbering its operand.2637if (!RHS.getNode()->hasOneUse() ||2638RHS.getNode()->getOpcode() == ISD::CopyFromReg ||2639RHS.getNode()->getOpcode() == ISD::TRUNCATE ||2640RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||2641(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&2642RHS.getOperand(0).getValueType() == MVT::i32))2643++Cost;2644// If the base is a register with multiple uses, this2645// transformation may save a mov.2646if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&2647!AM.Base_Reg.getNode()->hasOneUse()) ||2648AM.BaseType == X86ISelAddressMode::FrameIndexBase)2649--Cost;2650// If the folded LHS was interesting, this transformation saves2651// address arithmetic.2652if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +2653((AM.Disp != 0) && (Backup.Disp == 0)) +2654(AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)2655--Cost;2656// If it doesn't look like it may be an overall win, don't do it.2657if (Cost >= 0) {2658AM = Backup;2659break;2660}26612662// Ok, the transformation is legal and appears profitable. 
Go for it.2663// Negation will be emitted later to avoid creating dangling nodes if this2664// was an unprofitable LEA.2665AM.IndexReg = RHS;2666AM.NegateIndex = true;2667AM.Scale = 1;2668return false;2669}26702671case ISD::OR:2672case ISD::XOR:2673// See if we can treat the OR/XOR node as an ADD node.2674if (!CurDAG->isADDLike(N))2675break;2676[[fallthrough]];2677case ISD::ADD:2678if (!matchAdd(N, AM, Depth))2679return false;2680break;26812682case ISD::AND: {2683// Perform some heroic transforms on an and of a constant-count shift2684// with a constant to enable use of the scaled offset field.26852686// Scale must not be used already.2687if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;26882689// We only handle up to 64-bit values here as those are what matter for2690// addressing mode optimizations.2691assert(N.getSimpleValueType().getSizeInBits() <= 64 &&2692"Unexpected value size!");26932694if (!isa<ConstantSDNode>(N.getOperand(1)))2695break;26962697if (N.getOperand(0).getOpcode() == ISD::SRL) {2698SDValue Shift = N.getOperand(0);2699SDValue X = Shift.getOperand(0);27002701uint64_t Mask = N.getConstantOperandVal(1);27022703// Try to fold the mask and shift into an extract and scale.2704if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))2705return false;27062707// Try to fold the mask and shift directly into the scale.2708if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))2709return false;27102711// Try to fold the mask and shift into BEXTR and scale.2712if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))2713return false;2714}27152716// Try to swap the mask and shift to place shifts which can be done as2717// a scale on the outside of the mask.2718if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))2719return false;27202721break;2722}2723case ISD::ZERO_EXTEND: {2724// Try to widen a zexted shift left to the same size as its use, so we can2725// match the shift as a scale factor.2726if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)2727break;27282729SDValue Src = N.getOperand(0);27302731// See if we can match a zext(addlike(x,c)).2732// TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.2733if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)2734if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))2735if (Index != N) {2736AM.IndexReg = Index;2737return false;2738}27392740// Peek through mask: zext(and(shl(x,c1),c2))2741APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());2742if (Src.getOpcode() == ISD::AND && Src.hasOneUse())2743if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {2744Mask = MaskC->getAPIntValue();2745Src = Src.getOperand(0);2746}27472748if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {2749// Give up if the shift is not a valid scale factor [1,2,3].2750SDValue ShlSrc = Src.getOperand(0);2751SDValue ShlAmt = Src.getOperand(1);2752auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);2753if (!ShAmtC)2754break;2755unsigned ShAmtV = ShAmtC->getZExtValue();2756if (ShAmtV > 3)2757break;27582759// The narrow shift must only shift out zero bits (it must be 'nuw').2760// That makes it safe to widen to the destination type.2761APInt HighZeros =2762APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);2763if (!Src->getFlags().hasNoUnsignedWrap() &&2764!CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))2765break;27662767// zext (shl nuw i8 %x, C1) to i322768// --> shl (zext i8 %x to i32), (zext C1)2769// zext (and (shl nuw i8 %x, C1), C2) to i322770// --> 
shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)2771MVT SrcVT = ShlSrc.getSimpleValueType();2772MVT VT = N.getSimpleValueType();2773SDLoc DL(N);27742775SDValue Res = ShlSrc;2776if (!Mask.isAllOnes()) {2777Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);2778insertDAGNode(*CurDAG, N, Res);2779Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);2780insertDAGNode(*CurDAG, N, Res);2781}2782SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);2783insertDAGNode(*CurDAG, N, Zext);2784SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);2785insertDAGNode(*CurDAG, N, NewShl);2786CurDAG->ReplaceAllUsesWith(N, NewShl);2787CurDAG->RemoveDeadNode(N.getNode());27882789// Convert the shift to scale factor.2790AM.Scale = 1 << ShAmtV;2791// If matchIndexRecursively is not called here,2792// Zext may be replaced by other nodes but later used to call a builder2793// method2794AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);2795return false;2796}27972798if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {2799// Try to fold the mask and shift into an extract and scale.2800if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,2801Src.getOperand(0), AM))2802return false;28032804// Try to fold the mask and shift directly into the scale.2805if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,2806Src.getOperand(0), AM))2807return false;28082809// Try to fold the mask and shift into BEXTR and scale.2810if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,2811Src.getOperand(0), AM, *Subtarget))2812return false;2813}28142815break;2816}2817}28182819return matchAddressBase(N, AM);2820}28212822/// Helper for MatchAddress. Add the specified node to the2823/// specified addressing mode without any further recursion.2824bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {2825// Is the base register already occupied?2826if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {2827// If so, check to see if the scale index register is set.2828if (!AM.IndexReg.getNode()) {2829AM.IndexReg = N;2830AM.Scale = 1;2831return false;2832}28332834// Otherwise, we cannot select it.2835return true;2836}28372838// Default, generate it as a register.2839AM.BaseType = X86ISelAddressMode::RegBase;2840AM.Base_Reg = N;2841return false;2842}28432844bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,2845X86ISelAddressMode &AM,2846unsigned Depth) {2847SDLoc dl(N);2848LLVM_DEBUG({2849dbgs() << "MatchVectorAddress: ";2850AM.dump(CurDAG);2851});2852// Limit recursion.2853if (Depth >= SelectionDAG::MaxRecursionDepth)2854return matchAddressBase(N, AM);28552856// TODO: Support other operations.2857switch (N.getOpcode()) {2858case ISD::Constant: {2859uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();2860if (!foldOffsetIntoAddress(Val, AM))2861return false;2862break;2863}2864case X86ISD::Wrapper:2865if (!matchWrapper(N, AM))2866return false;2867break;2868case ISD::ADD: {2869// Add an artificial use to this node so that we can keep track of2870// it if it gets CSE'd with a different node.2871HandleSDNode Handle(N);28722873X86ISelAddressMode Backup = AM;2874if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&2875!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,2876Depth + 1))2877return false;2878AM = Backup;28792880// Try again after commuting the operands.2881if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,2882Depth + 1) 
&&2883!matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,2884Depth + 1))2885return false;2886AM = Backup;28872888N = Handle.getValue();2889break;2890}2891}28922893return matchAddressBase(N, AM);2894}28952896/// Helper for selectVectorAddr. Handles things that can be folded into a2897/// gather/scatter address. The index register and scale should have already2898/// been handled.2899bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {2900return matchVectorAddressRecursively(N, AM, 0);2901}29022903bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,2904SDValue IndexOp, SDValue ScaleOp,2905SDValue &Base, SDValue &Scale,2906SDValue &Index, SDValue &Disp,2907SDValue &Segment) {2908X86ISelAddressMode AM;2909AM.Scale = ScaleOp->getAsZExtVal();29102911// Attempt to match index patterns, as long as we're not relying on implicit2912// sign-extension, which is performed BEFORE scale.2913if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())2914AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);2915else2916AM.IndexReg = IndexOp;29172918unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();2919if (AddrSpace == X86AS::GS)2920AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);2921if (AddrSpace == X86AS::FS)2922AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);2923if (AddrSpace == X86AS::SS)2924AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);29252926SDLoc DL(BasePtr);2927MVT VT = BasePtr.getSimpleValueType();29282929// Try to match into the base and displacement fields.2930if (matchVectorAddress(BasePtr, AM))2931return false;29322933getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);2934return true;2935}29362937/// Returns true if it is able to pattern match an addressing mode.2938/// It returns the operands which make up the maximal addressing mode it can2939/// match by reference.2940///2941/// Parent is the parent node of the addr operand that is being matched. It2942/// is always a load, store, atomic node, or null. 
It is only null when2943/// checking memory operands for inline asm nodes.2944bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,2945SDValue &Scale, SDValue &Index,2946SDValue &Disp, SDValue &Segment) {2947X86ISelAddressMode AM;29482949if (Parent &&2950// This list of opcodes are all the nodes that have an "addr:$ptr" operand2951// that are not a MemSDNode, and thus don't have proper addrspace info.2952Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme2953Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores2954Parent->getOpcode() != X86ISD::TLSCALL && // Fixme2955Parent->getOpcode() != X86ISD::ENQCMD && // Fixme2956Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme2957Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp2958Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp2959unsigned AddrSpace =2960cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();2961if (AddrSpace == X86AS::GS)2962AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);2963if (AddrSpace == X86AS::FS)2964AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);2965if (AddrSpace == X86AS::SS)2966AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);2967}29682969// Save the DL and VT before calling matchAddress, it can invalidate N.2970SDLoc DL(N);2971MVT VT = N.getSimpleValueType();29722973if (matchAddress(N, AM))2974return false;29752976getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);2977return true;2978}29792980bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {2981// Cannot use 32 bit constants to reference objects in kernel/large code2982// model.2983if (TM.getCodeModel() == CodeModel::Kernel ||2984TM.getCodeModel() == CodeModel::Large)2985return false;29862987// In static codegen with small code model, we can get the address of a label2988// into a register with 'movl'2989if (N->getOpcode() != X86ISD::Wrapper)2990return false;29912992N = N.getOperand(0);29932994// At least GNU as does not accept 'movl' for TPOFF relocations.2995// FIXME: We could use 'movl' when we know we are targeting MC.2996if (N->getOpcode() == ISD::TargetGlobalTLSAddress)2997return false;29982999Imm = N;3000// Small/medium code model can reference non-TargetGlobalAddress objects with3001// 32 bit constants.3002if (N->getOpcode() != ISD::TargetGlobalAddress) {3003return TM.getCodeModel() == CodeModel::Small ||3004TM.getCodeModel() == CodeModel::Medium;3005}30063007const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();3008if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())3009return CR->getUnsignedMax().ult(1ull << 32);30103011return !TM.isLargeGlobalValue(GV);3012}30133014bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,3015SDValue &Scale, SDValue &Index,3016SDValue &Disp, SDValue &Segment) {3017// Save the debug loc before calling selectLEAAddr, in case it invalidates N.3018SDLoc DL(N);30193020if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))3021return false;30223023auto *RN = dyn_cast<RegisterSDNode>(Base);3024if (RN && RN->getReg() == 0)3025Base = CurDAG->getRegister(0, MVT::i64);3026else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {3027// Base could already be %rip, particularly in the x32 ABI.3028SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,3029MVT::i64), 0);3030Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,3031Base);3032}30333034RN = dyn_cast<RegisterSDNode>(Index);3035if (RN && RN->getReg() == 0)3036Index 
= CurDAG->getRegister(0, MVT::i64);3037else {3038assert(Index.getValueType() == MVT::i32 &&3039"Expect to be extending 32-bit registers for use in LEA");3040SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,3041MVT::i64), 0);3042Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,3043Index);3044}30453046return true;3047}30483049/// Calls SelectAddr and determines if the maximal addressing3050/// mode it matches can be cost effectively emitted as an LEA instruction.3051bool X86DAGToDAGISel::selectLEAAddr(SDValue N,3052SDValue &Base, SDValue &Scale,3053SDValue &Index, SDValue &Disp,3054SDValue &Segment) {3055X86ISelAddressMode AM;30563057// Save the DL and VT before calling matchAddress, it can invalidate N.3058SDLoc DL(N);3059MVT VT = N.getSimpleValueType();30603061// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support3062// segments.3063SDValue Copy = AM.Segment;3064SDValue T = CurDAG->getRegister(0, MVT::i32);3065AM.Segment = T;3066if (matchAddress(N, AM))3067return false;3068assert (T == AM.Segment);3069AM.Segment = Copy;30703071unsigned Complexity = 0;3072if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())3073Complexity = 1;3074else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)3075Complexity = 4;30763077if (AM.IndexReg.getNode())3078Complexity++;30793080// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with3081// a simple shift.3082if (AM.Scale > 1)3083Complexity++;30843085// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA3086// to a LEA. This is determined with some experimentation but is by no means3087// optimal (especially for code size consideration). LEA is nice because of3088// its three-address nature. 
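  // (e.g. "leal 4(%rdi,%rsi,2), %eax" computes base + 2*index + 4 into a third
  //  register without clobbering either source, unlike a two-address addl.)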
Tweak the cost function again when we can run3089// convertToThreeAddress() at register allocation time.3090if (AM.hasSymbolicDisplacement()) {3091// For X86-64, always use LEA to materialize RIP-relative addresses.3092if (Subtarget->is64Bit())3093Complexity = 4;3094else3095Complexity += 2;3096}30973098// Heuristic: try harder to form an LEA from ADD if the operands set flags.3099// Unlike ADD, LEA does not affect flags, so we will be less likely to require3100// duplicating flag-producing instructions later in the pipeline.3101if (N.getOpcode() == ISD::ADD) {3102auto isMathWithFlags = [](SDValue V) {3103switch (V.getOpcode()) {3104case X86ISD::ADD:3105case X86ISD::SUB:3106case X86ISD::ADC:3107case X86ISD::SBB:3108case X86ISD::SMUL:3109case X86ISD::UMUL:3110/* TODO: These opcodes can be added safely, but we may want to justify3111their inclusion for different reasons (better for reg-alloc).3112case X86ISD::OR:3113case X86ISD::XOR:3114case X86ISD::AND:3115*/3116// Value 1 is the flag output of the node - verify it's not dead.3117return !SDValue(V.getNode(), 1).use_empty();3118default:3119return false;3120}3121};3122// TODO: We might want to factor in whether there's a load folding3123// opportunity for the math op that disappears with LEA.3124if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))3125Complexity++;3126}31273128if (AM.Disp)3129Complexity++;31303131// If it isn't worth using an LEA, reject it.3132if (Complexity <= 2)3133return false;31343135getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);3136return true;3137}31383139/// This is only run on TargetGlobalTLSAddress nodes.3140bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,3141SDValue &Scale, SDValue &Index,3142SDValue &Disp, SDValue &Segment) {3143assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||3144N.getOpcode() == ISD::TargetExternalSymbol);31453146X86ISelAddressMode AM;3147if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {3148AM.GV = GA->getGlobal();3149AM.Disp += GA->getOffset();3150AM.SymbolFlags = GA->getTargetFlags();3151} else {3152auto *SA = cast<ExternalSymbolSDNode>(N);3153AM.ES = SA->getSymbol();3154AM.SymbolFlags = SA->getTargetFlags();3155}31563157if (Subtarget->is32Bit()) {3158AM.Scale = 1;3159AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);3160}31613162MVT VT = N.getSimpleValueType();3163getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);3164return true;3165}31663167bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {3168// Keep track of the original value type and whether this value was3169// truncated. If we see a truncation from pointer type to VT that truncates3170// bits that are known to be zero, we can use a narrow reference.3171EVT VT = N.getValueType();3172bool WasTruncated = false;3173if (N.getOpcode() == ISD::TRUNCATE) {3174WasTruncated = true;3175N = N.getOperand(0);3176}31773178if (N.getOpcode() != X86ISD::Wrapper)3179return false;31803181// We can only use non-GlobalValues as immediates if they were not truncated,3182// as we do not have any range information. 
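  // (GlobalValues, by contrast, may carry an absolute symbol range, checked
  //  below via getAbsoluteSymbolRange(), which is what can make a truncated
  //  reference provably safe.)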
If we have a GlobalValue and the3183// address was not truncated, we can select it as an operand directly.3184unsigned Opc = N.getOperand(0)->getOpcode();3185if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {3186Op = N.getOperand(0);3187// We can only select the operand directly if we didn't have to look past a3188// truncate.3189return !WasTruncated;3190}31913192// Check that the global's range fits into VT.3193auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));3194std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();3195if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))3196return false;31973198// Okay, we can use a narrow reference.3199Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,3200GA->getOffset(), GA->getTargetFlags());3201return true;3202}32033204bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,3205SDValue &Base, SDValue &Scale,3206SDValue &Index, SDValue &Disp,3207SDValue &Segment) {3208assert(Root && P && "Unknown root/parent nodes");3209if (!ISD::isNON_EXTLoad(N.getNode()) ||3210!IsProfitableToFold(N, P, Root) ||3211!IsLegalToFold(N, P, Root, OptLevel))3212return false;32133214return selectAddr(N.getNode(),3215N.getOperand(1), Base, Scale, Index, Disp, Segment);3216}32173218bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,3219SDValue &Base, SDValue &Scale,3220SDValue &Index, SDValue &Disp,3221SDValue &Segment) {3222assert(Root && P && "Unknown root/parent nodes");3223if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||3224!IsProfitableToFold(N, P, Root) ||3225!IsLegalToFold(N, P, Root, OptLevel))3226return false;32273228return selectAddr(N.getNode(),3229N.getOperand(1), Base, Scale, Index, Disp, Segment);3230}32313232/// Return an SDNode that returns the value of the global base register.3233/// Output instructions required to initialize the global base register,3234/// if necessary.3235SDNode *X86DAGToDAGISel::getGlobalBaseReg() {3236unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);3237auto &DL = MF->getDataLayout();3238return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();3239}32403241bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {3242if (N->getOpcode() == ISD::TRUNCATE)3243N = N->getOperand(0).getNode();3244if (N->getOpcode() != X86ISD::Wrapper)3245return false;32463247auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));3248if (!GA)3249return false;32503251auto *GV = GA->getGlobal();3252std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();3253if (CR)3254return CR->getSignedMin().sge(-1ull << Width) &&3255CR->getSignedMax().slt(1ull << Width);3256// In the kernel code model, globals are in the negative 2GB of the address3257// space, so globals can be a sign extended 32-bit immediate.3258// In other code models, small globals are in the low 2GB of the address3259// space, so sign extending them is equivalent to zero extending them.3260return Width == 32 && !TM.isLargeGlobalValue(GV);3261}32623263X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {3264assert(N->isMachineOpcode() && "Unexpected node");3265unsigned Opc = N->getMachineOpcode();3266const MCInstrDesc &MCID = getInstrInfo()->get(Opc);3267int CondNo = X86::getCondSrcNoFromDesc(MCID);3268if (CondNo < 0)3269return X86::COND_INVALID;32703271return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));3272}32733274/// Test whether the given X86ISD::CMP node has any users that use a flag3275/// other than ZF.3276bool 
X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {3277// Examine each user of the node.3278for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();3279UI != UE; ++UI) {3280// Only check things that use the flags.3281if (UI.getUse().getResNo() != Flags.getResNo())3282continue;3283// Only examine CopyToReg uses that copy to EFLAGS.3284if (UI->getOpcode() != ISD::CopyToReg ||3285cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)3286return false;3287// Examine each user of the CopyToReg use.3288for (SDNode::use_iterator FlagUI = UI->use_begin(),3289FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {3290// Only examine the Flag result.3291if (FlagUI.getUse().getResNo() != 1) continue;3292// Anything unusual: assume conservatively.3293if (!FlagUI->isMachineOpcode()) return false;3294// Examine the condition code of the user.3295X86::CondCode CC = getCondFromNode(*FlagUI);32963297switch (CC) {3298// Comparisons which only use the zero flag.3299case X86::COND_E: case X86::COND_NE:3300continue;3301// Anything else: assume conservatively.3302default:3303return false;3304}3305}3306}3307return true;3308}33093310/// Test whether the given X86ISD::CMP node has any uses which require the SF3311/// flag to be accurate.3312bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {3313// Examine each user of the node.3314for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();3315UI != UE; ++UI) {3316// Only check things that use the flags.3317if (UI.getUse().getResNo() != Flags.getResNo())3318continue;3319// Only examine CopyToReg uses that copy to EFLAGS.3320if (UI->getOpcode() != ISD::CopyToReg ||3321cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)3322return false;3323// Examine each user of the CopyToReg use.3324for (SDNode::use_iterator FlagUI = UI->use_begin(),3325FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {3326// Only examine the Flag result.3327if (FlagUI.getUse().getResNo() != 1) continue;3328// Anything unusual: assume conservatively.3329if (!FlagUI->isMachineOpcode()) return false;3330// Examine the condition code of the user.3331X86::CondCode CC = getCondFromNode(*FlagUI);33323333switch (CC) {3334// Comparisons which don't examine the SF flag.3335case X86::COND_A: case X86::COND_AE:3336case X86::COND_B: case X86::COND_BE:3337case X86::COND_E: case X86::COND_NE:3338case X86::COND_O: case X86::COND_NO:3339case X86::COND_P: case X86::COND_NP:3340continue;3341// Anything else: assume conservatively.3342default:3343return false;3344}3345}3346}3347return true;3348}33493350static bool mayUseCarryFlag(X86::CondCode CC) {3351switch (CC) {3352// Comparisons which don't examine the CF flag.3353case X86::COND_O: case X86::COND_NO:3354case X86::COND_E: case X86::COND_NE:3355case X86::COND_S: case X86::COND_NS:3356case X86::COND_P: case X86::COND_NP:3357case X86::COND_L: case X86::COND_GE:3358case X86::COND_G: case X86::COND_LE:3359return false;3360// Anything else: assume conservatively.3361default:3362return true;3363}3364}33653366/// Test whether the given node which sets flags has any uses which require the3367/// CF flag to be accurate.3368bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {3369// Examine each user of the node.3370for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();3371UI != UE; ++UI) {3372// Only check things that use the flags.3373if (UI.getUse().getResNo() != Flags.getResNo())3374continue;33753376unsigned UIOpc = UI->getOpcode();33773378if (UIOpc == ISD::CopyToReg) 
{3379// Only examine CopyToReg uses that copy to EFLAGS.3380if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)3381return false;3382// Examine each user of the CopyToReg use.3383for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();3384FlagUI != FlagUE; ++FlagUI) {3385// Only examine the Flag result.3386if (FlagUI.getUse().getResNo() != 1)3387continue;3388// Anything unusual: assume conservatively.3389if (!FlagUI->isMachineOpcode())3390return false;3391// Examine the condition code of the user.3392X86::CondCode CC = getCondFromNode(*FlagUI);33933394if (mayUseCarryFlag(CC))3395return false;3396}33973398// This CopyToReg is ok. Move on to the next user.3399continue;3400}34013402// This might be an unselected node. So look for the pre-isel opcodes that3403// use flags.3404unsigned CCOpNo;3405switch (UIOpc) {3406default:3407// Something unusual. Be conservative.3408return false;3409case X86ISD::SETCC: CCOpNo = 0; break;3410case X86ISD::SETCC_CARRY: CCOpNo = 0; break;3411case X86ISD::CMOV: CCOpNo = 2; break;3412case X86ISD::BRCOND: CCOpNo = 2; break;3413}34143415X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);3416if (mayUseCarryFlag(CC))3417return false;3418}3419return true;3420}34213422/// Check whether or not the chain ending in StoreNode is suitable for doing3423/// the {load; op; store} to modify transformation.3424static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,3425SDValue StoredVal, SelectionDAG *CurDAG,3426unsigned LoadOpNo,3427LoadSDNode *&LoadNode,3428SDValue &InputChain) {3429// Is the stored value result 0 of the operation?3430if (StoredVal.getResNo() != 0) return false;34313432// Are there other uses of the operation other than the store?3433if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;34343435// Is the store non-extending and non-indexed?3436if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())3437return false;34383439SDValue Load = StoredVal->getOperand(LoadOpNo);3440// Is the stored value a non-extending and non-indexed load?3441if (!ISD::isNormalLoad(Load.getNode())) return false;34423443// Return LoadNode by reference.3444LoadNode = cast<LoadSDNode>(Load);34453446// Is store the only read of the loaded value?3447if (!Load.hasOneUse())3448return false;34493450// Is the address of the store the same as the load?3451if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||3452LoadNode->getOffset() != StoreNode->getOffset())3453return false;34543455bool FoundLoad = false;3456SmallVector<SDValue, 4> ChainOps;3457SmallVector<const SDNode *, 4> LoopWorklist;3458SmallPtrSet<const SDNode *, 16> Visited;3459const unsigned int Max = 1024;34603461// Visualization of Load-Op-Store fusion:3462// -------------------------3463// Legend:3464// *-lines = Chain operand dependencies.3465// |-lines = Normal operand dependencies.3466// Dependencies flow down and right. n-suffix references multiple nodes.3467//3468// C Xn C3469// * * *3470// * * *3471// Xn A-LD Yn TF Yn3472// * * \ | * |3473// * * \ | * |3474// * * \ | => A--LD_OP_ST3475// * * \| \3476// TF OP \3477// * | \ Zn3478// * | \3479// A-ST Zn3480//34813482// This merge induced dependences from: #1: Xn -> LD, OP, Zn3483// #2: Yn -> LD3484// #3: ST -> Zn34853486// Ensure the transform is safe by checking for the dual3487// dependencies to make sure we do not induce a loop.34883489// As LD is a predecessor to both OP and ST we can do this by checking:3490// a). if LD is a predecessor to a member of Xn or Yn.3491// b). 
if a Zn is a predecessor to ST.34923493// However, (b) can only occur through being a chain predecessor to3494// ST, which is the same as Zn being a member or predecessor of Xn,3495// which is a subset of LD being a predecessor of Xn. So it's3496// subsumed by check (a).34973498SDValue Chain = StoreNode->getChain();34993500// Gather X elements in ChainOps.3501if (Chain == Load.getValue(1)) {3502FoundLoad = true;3503ChainOps.push_back(Load.getOperand(0));3504} else if (Chain.getOpcode() == ISD::TokenFactor) {3505for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {3506SDValue Op = Chain.getOperand(i);3507if (Op == Load.getValue(1)) {3508FoundLoad = true;3509// Drop Load, but keep its chain. No cycle check necessary.3510ChainOps.push_back(Load.getOperand(0));3511continue;3512}3513LoopWorklist.push_back(Op.getNode());3514ChainOps.push_back(Op);3515}3516}35173518if (!FoundLoad)3519return false;35203521// Worklist is currently Xn. Add Yn to worklist.3522for (SDValue Op : StoredVal->ops())3523if (Op.getNode() != LoadNode)3524LoopWorklist.push_back(Op.getNode());35253526// Check (a) if Load is a predecessor to Xn + Yn3527if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,3528true))3529return false;35303531InputChain =3532CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);3533return true;3534}35353536// Change a chain of {load; op; store} of the same value into a simple op3537// through memory of that value, if the uses of the modified value and its3538// address are suitable.3539//3540// The tablegen pattern memory operand pattern is currently not able to match3541// the case where the EFLAGS on the original operation are used.3542//3543// To move this to tablegen, we'll need to improve tablegen to allow flags to3544// be transferred from a node in the pattern to the result node, probably with3545// a new keyword. For example, we have this3546// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",3547// [(store (add (loadi64 addr:$dst), -1), addr:$dst),3548// (implicit EFLAGS)]>;3549// but maybe need something like this3550// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",3551// [(store (add (loadi64 addr:$dst), -1), addr:$dst),3552// (transferrable EFLAGS)]>;3553//3554// Until then, we manually fold these and instruction select the operation3555// here.3556bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {3557auto *StoreNode = cast<StoreSDNode>(Node);3558SDValue StoredVal = StoreNode->getOperand(1);3559unsigned Opc = StoredVal->getOpcode();35603561// Before we try to select anything, make sure this is memory operand size3562// and opcode we can handle. Note that this must match the code below that3563// actually lowers the opcodes.3564EVT MemVT = StoreNode->getMemoryVT();3565if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&3566MemVT != MVT::i8)3567return false;35683569bool IsCommutable = false;3570bool IsNegate = false;3571switch (Opc) {3572default:3573return false;3574case X86ISD::SUB:3575IsNegate = isNullConstant(StoredVal.getOperand(0));3576break;3577case X86ISD::SBB:3578break;3579case X86ISD::ADD:3580case X86ISD::ADC:3581case X86ISD::AND:3582case X86ISD::OR:3583case X86ISD::XOR:3584IsCommutable = true;3585break;3586}35873588unsigned LoadOpNo = IsNegate ? 
1 : 0;3589LoadSDNode *LoadNode = nullptr;3590SDValue InputChain;3591if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,3592LoadNode, InputChain)) {3593if (!IsCommutable)3594return false;35953596// This operation is commutable, try the other operand.3597LoadOpNo = 1;3598if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,3599LoadNode, InputChain))3600return false;3601}36023603SDValue Base, Scale, Index, Disp, Segment;3604if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,3605Segment))3606return false;36073608auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,3609unsigned Opc8) {3610switch (MemVT.getSimpleVT().SimpleTy) {3611case MVT::i64:3612return Opc64;3613case MVT::i32:3614return Opc32;3615case MVT::i16:3616return Opc16;3617case MVT::i8:3618return Opc8;3619default:3620llvm_unreachable("Invalid size!");3621}3622};36233624MachineSDNode *Result;3625switch (Opc) {3626case X86ISD::SUB:3627// Handle negate.3628if (IsNegate) {3629unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,3630X86::NEG8m);3631const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};3632Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,3633MVT::Other, Ops);3634break;3635}3636[[fallthrough]];3637case X86ISD::ADD:3638// Try to match inc/dec.3639if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {3640bool IsOne = isOneConstant(StoredVal.getOperand(1));3641bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));3642// ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.3643if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {3644unsigned NewOpc =3645((Opc == X86ISD::ADD) == IsOne)3646? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)3647: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);3648const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};3649Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,3650MVT::Other, Ops);3651break;3652}3653}3654[[fallthrough]];3655case X86ISD::ADC:3656case X86ISD::SBB:3657case X86ISD::AND:3658case X86ISD::OR:3659case X86ISD::XOR: {3660auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {3661switch (Opc) {3662case X86ISD::ADD:3663return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,3664X86::ADD8mr);3665case X86ISD::ADC:3666return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,3667X86::ADC8mr);3668case X86ISD::SUB:3669return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,3670X86::SUB8mr);3671case X86ISD::SBB:3672return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,3673X86::SBB8mr);3674case X86ISD::AND:3675return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,3676X86::AND8mr);3677case X86ISD::OR:3678return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);3679case X86ISD::XOR:3680return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,3681X86::XOR8mr);3682default:3683llvm_unreachable("Invalid opcode!");3684}3685};3686auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {3687switch (Opc) {3688case X86ISD::ADD:3689return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,3690X86::ADD8mi);3691case X86ISD::ADC:3692return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,3693X86::ADC8mi);3694case X86ISD::SUB:3695return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,3696X86::SUB8mi);3697case X86ISD::SBB:3698return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,3699X86::SBB8mi);3700case 
X86ISD::AND:3701return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,3702X86::AND8mi);3703case X86ISD::OR:3704return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,3705X86::OR8mi);3706case X86ISD::XOR:3707return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,3708X86::XOR8mi);3709default:3710llvm_unreachable("Invalid opcode!");3711}3712};37133714unsigned NewOpc = SelectRegOpcode(Opc);3715SDValue Operand = StoredVal->getOperand(1-LoadOpNo);37163717// See if the operand is a constant that we can fold into an immediate3718// operand.3719if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {3720int64_t OperandV = OperandC->getSExtValue();37213722// Check if we can shrink the operand enough to fit in an immediate (or3723// fit into a smaller immediate) by negating it and switching the3724// operation.3725if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&3726((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||3727(MemVT == MVT::i64 && !isInt<32>(OperandV) &&3728isInt<32>(-OperandV))) &&3729hasNoCarryFlagUses(StoredVal.getValue(1))) {3730OperandV = -OperandV;3731Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;3732}37333734if (MemVT != MVT::i64 || isInt<32>(OperandV)) {3735Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);3736NewOpc = SelectImmOpcode(Opc);3737}3738}37393740if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {3741SDValue CopyTo =3742CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,3743StoredVal.getOperand(2), SDValue());37443745const SDValue Ops[] = {Base, Scale, Index, Disp,3746Segment, Operand, CopyTo, CopyTo.getValue(1)};3747Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,3748Ops);3749} else {3750const SDValue Ops[] = {Base, Scale, Index, Disp,3751Segment, Operand, InputChain};3752Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,3753Ops);3754}3755break;3756}3757default:3758llvm_unreachable("Invalid opcode!");3759}37603761MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),3762LoadNode->getMemOperand()};3763CurDAG->setNodeMemRefs(Result, MemOps);37643765// Update Load Chain uses as well.3766ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));3767ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));3768ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));3769CurDAG->RemoveDeadNode(Node);3770return true;3771}37723773// See if this is an X & Mask that we can match to BEXTR/BZHI.3774// Where Mask is one of the following patterns:3775// a) x & (1 << nbits) - 13776// b) x & ~(-1 << nbits)3777// c) x & (-1 >> (32 - y))3778// d) x << (32 - y) >> (32 - y)3779// e) (1 << nbits) - 13780bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {3781assert(3782(Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||3783Node->getOpcode() == ISD::SRL) &&3784"Should be either an and-mask, or right-shift after clearing high bits.");37853786// BEXTR is BMI instruction, BZHI is BMI2 instruction. 
We need at least one.3787if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())3788return false;37893790MVT NVT = Node->getSimpleValueType(0);37913792// Only supported for 32 and 64 bits.3793if (NVT != MVT::i32 && NVT != MVT::i64)3794return false;37953796SDValue NBits;3797bool NegateNBits;37983799// If we have BMI2's BZHI, we are ok with muti-use patterns.3800// Else, if we only have BMI1's BEXTR, we require one-use.3801const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();3802auto checkUses = [AllowExtraUsesByDefault](3803SDValue Op, unsigned NUses,3804std::optional<bool> AllowExtraUses) {3805return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||3806Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());3807};3808auto checkOneUse = [checkUses](SDValue Op,3809std::optional<bool> AllowExtraUses =3810std::nullopt) {3811return checkUses(Op, 1, AllowExtraUses);3812};3813auto checkTwoUse = [checkUses](SDValue Op,3814std::optional<bool> AllowExtraUses =3815std::nullopt) {3816return checkUses(Op, 2, AllowExtraUses);3817};38183819auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {3820if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {3821assert(V.getSimpleValueType() == MVT::i32 &&3822V.getOperand(0).getSimpleValueType() == MVT::i64 &&3823"Expected i64 -> i32 truncation");3824V = V.getOperand(0);3825}3826return V;3827};38283829// a) x & ((1 << nbits) + (-1))3830auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,3831&NegateNBits](SDValue Mask) -> bool {3832// Match `add`. Must only have one use!3833if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))3834return false;3835// We should be adding all-ones constant (i.e. subtracting one.)3836if (!isAllOnesConstant(Mask->getOperand(1)))3837return false;3838// Match `1 << nbits`. Might be truncated. Must only have one use!3839SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));3840if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))3841return false;3842if (!isOneConstant(M0->getOperand(0)))3843return false;3844NBits = M0->getOperand(1);3845NegateNBits = false;3846return true;3847};38483849auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {3850V = peekThroughOneUseTruncation(V);3851return CurDAG->MaskedValueIsAllOnes(3852V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),3853NVT.getSizeInBits()));3854};38553856// b) x & ~(-1 << nbits)3857auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,3858&NBits, &NegateNBits](SDValue Mask) -> bool {3859// Match `~()`. Must only have one use!3860if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))3861return false;3862// The -1 only has to be all-ones for the final Node's NVT.3863if (!isAllOnes(Mask->getOperand(1)))3864return false;3865// Match `-1 << nbits`. Might be truncated. 
Must only have one use!3866SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));3867if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))3868return false;3869// The -1 only has to be all-ones for the final Node's NVT.3870if (!isAllOnes(M0->getOperand(0)))3871return false;3872NBits = M0->getOperand(1);3873NegateNBits = false;3874return true;3875};38763877// Try to match potentially-truncated shift amount as `(bitwidth - y)`,3878// or leave the shift amount as-is, but then we'll have to negate it.3879auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,3880unsigned Bitwidth) {3881NBits = ShiftAmt;3882NegateNBits = true;3883// Skip over a truncate of the shift amount, if any.3884if (NBits.getOpcode() == ISD::TRUNCATE)3885NBits = NBits.getOperand(0);3886// Try to match the shift amount as (bitwidth - y). It should go away, too.3887// If it doesn't match, that's fine, we'll just negate it ourselves.3888if (NBits.getOpcode() != ISD::SUB)3889return;3890auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));3891if (!V0 || V0->getZExtValue() != Bitwidth)3892return;3893NBits = NBits.getOperand(1);3894NegateNBits = false;3895};38963897// c) x & (-1 >> z) but then we'll have to subtract z from bitwidth3898// or3899// c) x & (-1 >> (32 - y))3900auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,3901canonicalizeShiftAmt](SDValue Mask) -> bool {3902// The mask itself may be truncated.3903Mask = peekThroughOneUseTruncation(Mask);3904unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();3905// Match `l>>`. Must only have one use!3906if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))3907return false;3908// We should be shifting truly all-ones constant.3909if (!isAllOnesConstant(Mask.getOperand(0)))3910return false;3911SDValue M1 = Mask.getOperand(1);3912// The shift amount should not be used externally.3913if (!checkOneUse(M1))3914return false;3915canonicalizeShiftAmt(M1, Bitwidth);3916// Pattern c. is non-canonical, and is expanded into pattern d. iff there3917// is no extra use of the mask. 
Clearly, there was one since we are here.3918// But at the same time, if we need to negate the shift amount,3919// then we don't want the mask to stick around, else it's unprofitable.3920return !NegateNBits;3921};39223923SDValue X;39243925// d) x << z >> z but then we'll have to subtract z from bitwidth3926// or3927// d) x << (32 - y) >> (32 - y)3928auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,3929AllowExtraUsesByDefault, &NegateNBits,3930&X](SDNode *Node) -> bool {3931if (Node->getOpcode() != ISD::SRL)3932return false;3933SDValue N0 = Node->getOperand(0);3934if (N0->getOpcode() != ISD::SHL)3935return false;3936unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();3937SDValue N1 = Node->getOperand(1);3938SDValue N01 = N0->getOperand(1);3939// Both of the shifts must be by the exact same value.3940if (N1 != N01)3941return false;3942canonicalizeShiftAmt(N1, Bitwidth);3943// There should not be any external uses of the inner shift / shift amount.3944// Note that while we are generally okay with external uses given BMI2,3945// iff we need to negate the shift amount, we are not okay with extra uses.3946const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;3947if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))3948return false;3949X = N0->getOperand(0);3950return true;3951};39523953auto matchLowBitMask = [matchPatternA, matchPatternB,3954matchPatternC](SDValue Mask) -> bool {3955return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);3956};39573958if (Node->getOpcode() == ISD::AND) {3959X = Node->getOperand(0);3960SDValue Mask = Node->getOperand(1);39613962if (matchLowBitMask(Mask)) {3963// Great.3964} else {3965std::swap(X, Mask);3966if (!matchLowBitMask(Mask))3967return false;3968}3969} else if (matchLowBitMask(SDValue(Node, 0))) {3970X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);3971} else if (!matchPatternD(Node))3972return false;39733974// If we need to negate the shift amount, require BMI2 BZHI support.3975// It's just too unprofitable for BMI1 BEXTR.3976if (NegateNBits && !Subtarget->hasBMI2())3977return false;39783979SDLoc DL(Node);39803981// Truncate the shift amount.3982NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);3983insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);39843985// Insert 8-bit NBits into lowest 8 bits of 32-bit register.3986// All the other bits are undefined, we do not care about them.3987SDValue ImplDef = SDValue(3988CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);3989insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);39903991SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);3992insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);3993NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,3994MVT::i32, ImplDef, NBits, SRIdxVal),39950);3996insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);39973998// We might have matched the amount of high bits to be cleared,3999// but we want the amount of low bits to be kept, so negate it then.4000if (NegateNBits) {4001SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);4002insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);40034004NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);4005insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);4006}40074008if (Subtarget->hasBMI2()) {4009// Great, just emit the BZHI..4010if (NVT != MVT::i32) {4011// But have to place the bit count into the wide-enough register first.4012NBits = 
CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);4013insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);4014}40154016SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);4017ReplaceNode(Node, Extract.getNode());4018SelectCode(Extract.getNode());4019return true;4020}40214022// Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is4023// *logically* shifted (potentially with one-use trunc inbetween),4024// and the truncation was the only use of the shift,4025// and if so look past one-use truncation.4026{4027SDValue RealX = peekThroughOneUseTruncation(X);4028// FIXME: only if the shift is one-use?4029if (RealX != X && RealX.getOpcode() == ISD::SRL)4030X = RealX;4031}40324033MVT XVT = X.getSimpleValueType();40344035// Else, emitting BEXTR requires one more step.4036// The 'control' of BEXTR has the pattern of:4037// [15...8 bit][ 7...0 bit] location4038// [ bit count][ shift] name4039// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b1140404041// Shift NBits left by 8 bits, thus producing 'control'.4042// This makes the low 8 bits to be zero.4043SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);4044insertDAGNode(*CurDAG, SDValue(Node, 0), C8);4045SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);4046insertDAGNode(*CurDAG, SDValue(Node, 0), Control);40474048// If the 'X' is *logically* shifted, we can fold that shift into 'control'.4049// FIXME: only if the shift is one-use?4050if (X.getOpcode() == ISD::SRL) {4051SDValue ShiftAmt = X.getOperand(1);4052X = X.getOperand(0);40534054assert(ShiftAmt.getValueType() == MVT::i8 &&4055"Expected shift amount to be i8");40564057// Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!4058// We could zext to i16 in some form, but we intentionally don't do that.4059SDValue OrigShiftAmt = ShiftAmt;4060ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);4061insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);40624063// And now 'or' these low 8 bits of shift amount into the 'control'.4064Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);4065insertDAGNode(*CurDAG, SDValue(Node, 0), Control);4066}40674068// But have to place the 'control' into the wide-enough register first.4069if (XVT != MVT::i32) {4070Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);4071insertDAGNode(*CurDAG, SDValue(Node, 0), Control);4072}40734074// And finally, form the BEXTR itself.4075SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);40764077// The 'X' was originally truncated. Do that now.4078if (XVT != NVT) {4079insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);4080Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);4081}40824083ReplaceNode(Node, Extract.getNode());4084SelectCode(Extract.getNode());40854086return true;4087}40884089// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.4090MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {4091MVT NVT = Node->getSimpleValueType(0);4092SDLoc dl(Node);40934094SDValue N0 = Node->getOperand(0);4095SDValue N1 = Node->getOperand(1);40964097// If we have TBM we can use an immediate for the control. 
If we have BMI4098// we should only do this if the BEXTR instruction is implemented well.4099// Otherwise moving the control into a register makes this more costly.4100// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM4101// hoisting the move immediate would make it worthwhile with a less optimal4102// BEXTR?4103bool PreferBEXTR =4104Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());4105if (!PreferBEXTR && !Subtarget->hasBMI2())4106return nullptr;41074108// Must have a shift right.4109if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)4110return nullptr;41114112// Shift can't have additional users.4113if (!N0->hasOneUse())4114return nullptr;41154116// Only supported for 32 and 64 bits.4117if (NVT != MVT::i32 && NVT != MVT::i64)4118return nullptr;41194120// Shift amount and RHS of and must be constant.4121auto *MaskCst = dyn_cast<ConstantSDNode>(N1);4122auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));4123if (!MaskCst || !ShiftCst)4124return nullptr;41254126// And RHS must be a mask.4127uint64_t Mask = MaskCst->getZExtValue();4128if (!isMask_64(Mask))4129return nullptr;41304131uint64_t Shift = ShiftCst->getZExtValue();4132uint64_t MaskSize = llvm::popcount(Mask);41334134// Don't interfere with something that can be handled by extracting AH.4135// TODO: If we are able to fold a load, BEXTR might still be better than AH.4136if (Shift == 8 && MaskSize == 8)4137return nullptr;41384139// Make sure we are only using bits that were in the original value, not4140// shifted in.4141if (Shift + MaskSize > NVT.getSizeInBits())4142return nullptr;41434144// BZHI, if available, is always fast, unlike BEXTR. But even if we decide4145// that we can't use BEXTR, it is only worthwhile using BZHI if the mask4146// does not fit into 32 bits. Load folding is not a sufficient reason.4147if (!PreferBEXTR && MaskSize <= 32)4148return nullptr;41494150SDValue Control;4151unsigned ROpc, MOpc;41524153#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)4154if (!PreferBEXTR) {4155assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");4156// If we can't make use of BEXTR then we can't fuse shift+mask stages.4157// Let's perform the mask first, and apply shift later. Note that we need to4158// widen the mask to account for the fact that we'll apply shift afterwards!4159Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);4160ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)4161: GET_EGPR_IF_ENABLED(X86::BZHI32rr);4162MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)4163: GET_EGPR_IF_ENABLED(X86::BZHI32rm);4164unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;4165Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);4166} else {4167// The 'control' of BEXTR has the pattern of:4168// [15...8 bit][ 7...0 bit] location4169// [ bit count][ shift] name4170// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b114171Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);4172if (Subtarget->hasTBM()) {4173ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;4174MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;4175} else {4176assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");4177// BMI requires the immediate to placed in a register.4178ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)4179: GET_EGPR_IF_ENABLED(X86::BEXTR32rr);4180MOpc = NVT == MVT::i64 ? 
GET_EGPR_IF_ENABLED(X86::BEXTR64rm)4181: GET_EGPR_IF_ENABLED(X86::BEXTR32rm);4182unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;4183Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);4184}4185}41864187MachineSDNode *NewNode;4188SDValue Input = N0->getOperand(0);4189SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;4190if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {4191SDValue Ops[] = {4192Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};4193SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);4194NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);4195// Update the chain.4196ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));4197// Record the mem-refs4198CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});4199} else {4200NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);4201}42024203if (!PreferBEXTR) {4204// We still need to apply the shift.4205SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);4206unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)4207: GET_ND_IF_ENABLED(X86::SHR32ri);4208NewNode =4209CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);4210}42114212return NewNode;4213}42144215// Emit a PCMISTR(I/M) instruction.4216MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,4217bool MayFoldLoad, const SDLoc &dl,4218MVT VT, SDNode *Node) {4219SDValue N0 = Node->getOperand(0);4220SDValue N1 = Node->getOperand(1);4221SDValue Imm = Node->getOperand(2);4222auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();4223Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());42244225// Try to fold a load. No need to check alignment.4226SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;4227if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {4228SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,4229N1.getOperand(0) };4230SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);4231MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);4232// Update the chain.4233ReplaceUses(N1.getValue(1), SDValue(CNode, 2));4234// Record the mem-refs4235CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});4236return CNode;4237}42384239SDValue Ops[] = { N0, N1, Imm };4240SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);4241MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);4242return CNode;4243}42444245// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need4246// to emit a second instruction after this one. This is needed since we have two4247// copyToReg nodes glued before this and we need to continue that glue through.4248MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,4249bool MayFoldLoad, const SDLoc &dl,4250MVT VT, SDNode *Node,4251SDValue &InGlue) {4252SDValue N0 = Node->getOperand(0);4253SDValue N2 = Node->getOperand(2);4254SDValue Imm = Node->getOperand(4);4255auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();4256Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());42574258// Try to fold a load. 
No need to check alignment.4259SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;4260if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {4261SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,4262N2.getOperand(0), InGlue };4263SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);4264MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);4265InGlue = SDValue(CNode, 3);4266// Update the chain.4267ReplaceUses(N2.getValue(1), SDValue(CNode, 2));4268// Record the mem-refs4269CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});4270return CNode;4271}42724273SDValue Ops[] = { N0, N2, Imm, InGlue };4274SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);4275MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);4276InGlue = SDValue(CNode, 2);4277return CNode;4278}42794280bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {4281EVT VT = N->getValueType(0);42824283// Only handle scalar shifts.4284if (VT.isVector())4285return false;42864287// Narrower shifts only mask to 5 bits in hardware.4288unsigned Size = VT == MVT::i64 ? 64 : 32;42894290SDValue OrigShiftAmt = N->getOperand(1);4291SDValue ShiftAmt = OrigShiftAmt;4292SDLoc DL(N);42934294// Skip over a truncate of the shift amount.4295if (ShiftAmt->getOpcode() == ISD::TRUNCATE)4296ShiftAmt = ShiftAmt->getOperand(0);42974298// This function is called after X86DAGToDAGISel::matchBitExtract(),4299// so we are not afraid that we might mess up BZHI/BEXTR pattern.43004301SDValue NewShiftAmt;4302if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||4303ShiftAmt->getOpcode() == ISD::XOR) {4304SDValue Add0 = ShiftAmt->getOperand(0);4305SDValue Add1 = ShiftAmt->getOperand(1);4306auto *Add0C = dyn_cast<ConstantSDNode>(Add0);4307auto *Add1C = dyn_cast<ConstantSDNode>(Add1);4308// If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X4309// to avoid the ADD/SUB/XOR.4310if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {4311NewShiftAmt = Add0;43124313} else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&4314((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||4315(Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {4316// If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X4317// we can replace it with a NOT. In the XOR case it may save some code4318// size, in the SUB case it also may save a move.4319assert(Add0C == nullptr || Add1C == nullptr);43204321// We can only do N-X, not X-N4322if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)4323return false;43244325EVT OpVT = ShiftAmt.getValueType();43264327SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);4328NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,4329Add0C == nullptr ? 
Add0 : Add1, AllOnes);4330insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);4331insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);4332// If we are shifting by N-X where N == 0 mod Size, then just shift by4333// -X to generate a NEG instead of a SUB of a constant.4334} else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&4335Add0C->getZExtValue() != 0) {4336EVT SubVT = ShiftAmt.getValueType();4337SDValue X;4338if (Add0C->getZExtValue() % Size == 0)4339X = Add1;4340else if (ShiftAmt.hasOneUse() && Size == 64 &&4341Add0C->getZExtValue() % 32 == 0) {4342// We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).4343// This is mainly beneficial if we already compute (x+n*32).4344if (Add1.getOpcode() == ISD::TRUNCATE) {4345Add1 = Add1.getOperand(0);4346SubVT = Add1.getValueType();4347}4348if (Add0.getValueType() != SubVT) {4349Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);4350insertDAGNode(*CurDAG, OrigShiftAmt, Add0);4351}43524353X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);4354insertDAGNode(*CurDAG, OrigShiftAmt, X);4355} else4356return false;4357// Insert a negate op.4358// TODO: This isn't guaranteed to replace the sub if there is a logic cone4359// that uses it that's not a shift.4360SDValue Zero = CurDAG->getConstant(0, DL, SubVT);4361SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);4362NewShiftAmt = Neg;43634364// Insert these operands into a valid topological order so they can4365// get selected independently.4366insertDAGNode(*CurDAG, OrigShiftAmt, Zero);4367insertDAGNode(*CurDAG, OrigShiftAmt, Neg);4368} else4369return false;4370} else4371return false;43724373if (NewShiftAmt.getValueType() != MVT::i8) {4374// Need to truncate the shift amount.4375NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);4376// Add to a correct topological ordering.4377insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);4378}43794380// Insert a new mask to keep the shift amount legal. This should be removed4381// by isel patterns.4382NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,4383CurDAG->getConstant(Size - 1, DL, MVT::i8));4384// Place in a correct topological ordering.4385insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);43864387SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),4388NewShiftAmt);4389if (UpdatedNode != N) {4390// If we found an existing node, we should replace ourselves with that node4391// and wait for it to be selected after its other users.4392ReplaceNode(N, UpdatedNode);4393return true;4394}43954396// If the original shift amount is now dead, delete it so that we don't run4397// it through isel.4398if (OrigShiftAmt.getNode()->use_empty())4399CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());44004401// Now that we've optimized the shift amount, defer to normal isel to get4402// load folding and legacy vs BMI2 selection without repeating it here.4403SelectCode(N);4404return true;4405}44064407bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {4408MVT NVT = N->getSimpleValueType(0);4409unsigned Opcode = N->getOpcode();4410SDLoc dl(N);44114412// For operations of the form (x << C1) op C2, check if we can use a smaller4413// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.4414SDValue Shift = N->getOperand(0);4415SDValue N1 = N->getOperand(1);44164417auto *Cst = dyn_cast<ConstantSDNode>(N1);4418if (!Cst)4419return false;44204421int64_t Val = Cst->getSExtValue();44224423// If we have an any_extend feeding the AND, look through it to see if there4424// is a shift behind it. 
But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Doing this late to make the MaskedValueIsZero call as late as
  // possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
                                            ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
    X = NewX;
  }

  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
                                   Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}

bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm)
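// Worked example of the VPTERNLOG immediate convention manipulated below
// (added for illustration; derived from the 0xf0/0xcc/0xaa magic constants
// defined in tryVPTERNLOG): the 8-bit immediate is the truth table of the
// operation with A = 0xf0, B = 0xcc and C = 0xaa, so the bitselect
// (A & B) | (~A & C) is (0xf0 & 0xcc) | (0x0f & 0xaa) = 0xc0 | 0x0a = 0xca,
// matching the 0xCA immediate used by tryMatchBitSelect().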
{4524assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&4525C.isOperandOf(ParentC) && "Incorrect parent node");45264527auto tryFoldLoadOrBCast =4528[this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,4529SDValue &Index, SDValue &Disp, SDValue &Segment) {4530if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))4531return true;45324533// Not a load, check for broadcast which may be behind a bitcast.4534if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {4535P = L.getNode();4536L = L.getOperand(0);4537}45384539if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)4540return false;45414542// Only 32 and 64 bit broadcasts are supported.4543auto *MemIntr = cast<MemIntrinsicSDNode>(L);4544unsigned Size = MemIntr->getMemoryVT().getSizeInBits();4545if (Size != 32 && Size != 64)4546return false;45474548return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);4549};45504551bool FoldedLoad = false;4552SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;4553if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {4554FoldedLoad = true;4555} else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,4556Tmp4)) {4557FoldedLoad = true;4558std::swap(A, C);4559// Swap bits 1/4 and 3/6.4560uint8_t OldImm = Imm;4561Imm = OldImm & 0xa5;4562if (OldImm & 0x02) Imm |= 0x10;4563if (OldImm & 0x10) Imm |= 0x02;4564if (OldImm & 0x08) Imm |= 0x40;4565if (OldImm & 0x40) Imm |= 0x08;4566} else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,4567Tmp4)) {4568FoldedLoad = true;4569std::swap(B, C);4570// Swap bits 1/2 and 5/6.4571uint8_t OldImm = Imm;4572Imm = OldImm & 0x99;4573if (OldImm & 0x02) Imm |= 0x04;4574if (OldImm & 0x04) Imm |= 0x02;4575if (OldImm & 0x20) Imm |= 0x40;4576if (OldImm & 0x40) Imm |= 0x20;4577}45784579SDLoc DL(Root);45804581SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);45824583MVT NVT = Root->getSimpleValueType(0);45844585MachineSDNode *MNode;4586if (FoldedLoad) {4587SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);45884589unsigned Opc;4590if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {4591auto *MemIntr = cast<MemIntrinsicSDNode>(C);4592unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();4593assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");45944595bool UseD = EltSize == 32;4596if (NVT.is128BitVector())4597Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;4598else if (NVT.is256BitVector())4599Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;4600else if (NVT.is512BitVector())4601Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;4602else4603llvm_unreachable("Unexpected vector size!");4604} else {4605bool UseD = NVT.getVectorElementType() == MVT::i32;4606if (NVT.is128BitVector())4607Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;4608else if (NVT.is256BitVector())4609Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;4610else if (NVT.is512BitVector())4611Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;4612else4613llvm_unreachable("Unexpected vector size!");4614}46154616SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};4617MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);46184619// Update the chain.4620ReplaceUses(C.getValue(1), SDValue(MNode, 1));4621// Record the mem-refs4622CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});4623} else {4624bool UseD = NVT.getVectorElementType() == MVT::i32;4625unsigned Opc;4626if (NVT.is128BitVector())4627Opc = UseD ? 
X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;4628else if (NVT.is256BitVector())4629Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;4630else if (NVT.is512BitVector())4631Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;4632else4633llvm_unreachable("Unexpected vector size!");46344635MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});4636}46374638ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));4639CurDAG->RemoveDeadNode(Root);4640return true;4641}46424643// Try to match two logic ops to a VPTERNLOG.4644// FIXME: Handle more complex patterns that use an operand more than once?4645bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {4646MVT NVT = N->getSimpleValueType(0);46474648// Make sure we support VPTERNLOG.4649if (!NVT.isVector() || !Subtarget->hasAVX512() ||4650NVT.getVectorElementType() == MVT::i1)4651return false;46524653// We need VLX for 128/256-bit.4654if (!(Subtarget->hasVLX() || NVT.is512BitVector()))4655return false;46564657SDValue N0 = N->getOperand(0);4658SDValue N1 = N->getOperand(1);46594660auto getFoldableLogicOp = [](SDValue Op) {4661// Peek through single use bitcast.4662if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())4663Op = Op.getOperand(0);46644665if (!Op.hasOneUse())4666return SDValue();46674668unsigned Opc = Op.getOpcode();4669if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||4670Opc == X86ISD::ANDNP)4671return Op;46724673return SDValue();4674};46754676SDValue A, FoldableOp;4677if ((FoldableOp = getFoldableLogicOp(N1))) {4678A = N0;4679} else if ((FoldableOp = getFoldableLogicOp(N0))) {4680A = N1;4681} else4682return false;46834684SDValue B = FoldableOp.getOperand(0);4685SDValue C = FoldableOp.getOperand(1);4686SDNode *ParentA = N;4687SDNode *ParentB = FoldableOp.getNode();4688SDNode *ParentC = FoldableOp.getNode();46894690// We can build the appropriate control immediate by performing the logic4691// operation we're matching using these constants for A, B, and C.4692uint8_t TernlogMagicA = 0xf0;4693uint8_t TernlogMagicB = 0xcc;4694uint8_t TernlogMagicC = 0xaa;46954696// Some of the inputs may be inverted, peek through them and invert the4697// magic values accordingly.4698// TODO: There may be a bitcast before the xor that we should peek through.4699auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {4700if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&4701ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {4702Magic = ~Magic;4703Parent = Op.getNode();4704Op = Op.getOperand(0);4705}4706};47074708PeekThroughNot(A, ParentA, TernlogMagicA);4709PeekThroughNot(B, ParentB, TernlogMagicB);4710PeekThroughNot(C, ParentC, TernlogMagicC);47114712uint8_t Imm;4713switch (FoldableOp.getOpcode()) {4714default: llvm_unreachable("Unexpected opcode!");4715case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;4716case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;4717case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;4718case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;4719}47204721switch (N->getOpcode()) {4722default: llvm_unreachable("Unexpected opcode!");4723case X86ISD::ANDNP:4724if (A == N0)4725Imm &= ~TernlogMagicA;4726else4727Imm = ~(Imm) & TernlogMagicA;4728break;4729case ISD::AND: Imm &= TernlogMagicA; break;4730case ISD::OR: Imm |= TernlogMagicA; break;4731case ISD::XOR: Imm ^= TernlogMagicA; break;4732}47334734return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);4735}47364737/// If the high bits of an 'and' operand are known zero, try setting the4738/// high 
bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}

static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ?
X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;481548164817#define VPTESTM_BROADCAST_CASES(SUFFIX) \4818default: llvm_unreachable("Unexpected VT!"); \4819VPTESTM_CASE(v4i32, DZ128##SUFFIX) \4820VPTESTM_CASE(v2i64, QZ128##SUFFIX) \4821VPTESTM_CASE(v8i32, DZ256##SUFFIX) \4822VPTESTM_CASE(v4i64, QZ256##SUFFIX) \4823VPTESTM_CASE(v16i32, DZ##SUFFIX) \4824VPTESTM_CASE(v8i64, QZ##SUFFIX)48254826#define VPTESTM_FULL_CASES(SUFFIX) \4827VPTESTM_BROADCAST_CASES(SUFFIX) \4828VPTESTM_CASE(v16i8, BZ128##SUFFIX) \4829VPTESTM_CASE(v8i16, WZ128##SUFFIX) \4830VPTESTM_CASE(v32i8, BZ256##SUFFIX) \4831VPTESTM_CASE(v16i16, WZ256##SUFFIX) \4832VPTESTM_CASE(v64i8, BZ##SUFFIX) \4833VPTESTM_CASE(v32i16, WZ##SUFFIX)48344835if (FoldedBCast) {4836switch (TestVT.SimpleTy) {4837VPTESTM_BROADCAST_CASES(rmb)4838}4839}48404841if (FoldedLoad) {4842switch (TestVT.SimpleTy) {4843VPTESTM_FULL_CASES(rm)4844}4845}48464847switch (TestVT.SimpleTy) {4848VPTESTM_FULL_CASES(rr)4849}48504851#undef VPTESTM_FULL_CASES4852#undef VPTESTM_BROADCAST_CASES4853#undef VPTESTM_CASE4854}48554856// Try to create VPTESTM instruction. If InMask is not null, it will be used4857// to form a masked operation.4858bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,4859SDValue InMask) {4860assert(Subtarget->hasAVX512() && "Expected AVX512!");4861assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&4862"Unexpected VT!");48634864// Look for equal and not equal compares.4865ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();4866if (CC != ISD::SETEQ && CC != ISD::SETNE)4867return false;48684869SDValue SetccOp0 = Setcc.getOperand(0);4870SDValue SetccOp1 = Setcc.getOperand(1);48714872// Canonicalize the all zero vector to the RHS.4873if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))4874std::swap(SetccOp0, SetccOp1);48754876// See if we're comparing against zero.4877if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))4878return false;48794880SDValue N0 = SetccOp0;48814882MVT CmpVT = N0.getSimpleValueType();4883MVT CmpSVT = CmpVT.getVectorElementType();48844885// Start with both operands the same. We'll try to refine this.4886SDValue Src0 = N0;4887SDValue Src1 = N0;48884889{4890// Look through single use bitcasts.4891SDValue N0Temp = N0;4892if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())4893N0Temp = N0.getOperand(0);48944895// Look for single use AND.4896if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {4897Src0 = N0Temp.getOperand(0);4898Src1 = N0Temp.getOperand(1);4899}4900}49014902// Without VLX we need to widen the operation.4903bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();49044905auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,4906SDValue &Base, SDValue &Scale, SDValue &Index,4907SDValue &Disp, SDValue &Segment) {4908// If we need to widen, we can't fold the load.4909if (!Widen)4910if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))4911return true;49124913// If we didn't fold a load, try to match broadcast. No widening limitation4914// for this. 
But only 32 and 64 bit types are supported.4915if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)4916return false;49174918// Look through single use bitcasts.4919if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {4920P = L.getNode();4921L = L.getOperand(0);4922}49234924if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)4925return false;49264927auto *MemIntr = cast<MemIntrinsicSDNode>(L);4928if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())4929return false;49304931return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);4932};49334934// We can only fold loads if the sources are unique.4935bool CanFoldLoads = Src0 != Src1;49364937bool FoldedLoad = false;4938SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;4939if (CanFoldLoads) {4940FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,4941Tmp3, Tmp4);4942if (!FoldedLoad) {4943// And is commutative.4944FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,4945Tmp2, Tmp3, Tmp4);4946if (FoldedLoad)4947std::swap(Src0, Src1);4948}4949}49504951bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;49524953bool IsMasked = InMask.getNode() != nullptr;49544955SDLoc dl(Root);49564957MVT ResVT = Setcc.getSimpleValueType();4958MVT MaskVT = ResVT;4959if (Widen) {4960// Widen the inputs using insert_subreg or copy_to_regclass.4961unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;4962unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;4963unsigned NumElts = CmpVT.getVectorNumElements() * Scale;4964CmpVT = MVT::getVectorVT(CmpSVT, NumElts);4965MaskVT = MVT::getVectorVT(MVT::i1, NumElts);4966SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,4967CmpVT), 0);4968Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);49694970if (!FoldedBCast)4971Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);49724973if (IsMasked) {4974// Widen the mask.4975unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();4976SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);4977InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,4978dl, MaskVT, InMask, RC), 0);4979}4980}49814982bool IsTestN = CC == ISD::SETEQ;4983unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,4984IsMasked);49854986MachineSDNode *CNode;4987if (FoldedLoad) {4988SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);49894990if (IsMasked) {4991SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,4992Src1.getOperand(0) };4993CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);4994} else {4995SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,4996Src1.getOperand(0) };4997CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);4998}49995000// Update the chain.5001ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));5002// Record the mem-refs5003CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});5004} else {5005if (IsMasked)5006CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);5007else5008CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);5009}50105011// If we widened, we need to shrink the mask VT.5012if (Widen) {5013unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();5014SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);5015CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,5016dl, ResVT, SDValue(CNode, 0), RC);5017}50185019ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));5020CurDAG->RemoveDeadNode(Root);5021return true;5022}50235024// Try to match the 
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
// into vpternlog.
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");

  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512())
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::AND ||
      N1.getOpcode() != X86ISD::ANDNP ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return false;

  // ANDN is not commutable, so use it to pin down A and C.
  SDValue A = N1.getOperand(0);
  SDValue C = N1.getOperand(1);

  // AND is commutable, if one operand matches A, the other operand is B.
  // Otherwise this isn't a match.
  SDValue B;
  if (N0.getOperand(0) == A)
    B = N0.getOperand(1);
  else if (N0.getOperand(1) == A)
    B = N0.getOperand(0);
  else
    return false;

  SDLoc dl(N);
  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
  ReplaceNode(N, Ternlog.getNode());

  return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
                        Ternlog.getNode(), A, B, C, 0xCA);
}

void X86DAGToDAGISel::Select(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  unsigned Opcode = Node->getOpcode();
  SDLoc dl(Node);

  if (Node->isMachineOpcode()) {
    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
    Node->setNodeId(-1);
    return; // Already selected.
  }

  switch (Opcode) {
  default: break;
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_encodekey128:
    case Intrinsic::x86_encodekey256: {
      if (!Subtarget->hasKL())
        break;

      unsigned Opcode;
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_encodekey128:
        Opcode = X86::ENCODEKEY128;
        break;
      case Intrinsic::x86_encodekey256:
        Opcode = X86::ENCODEKEY256;
        break;
      }

      SDValue Chain = Node->getOperand(0);
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
                                   SDValue());
      if (Opcode == X86::ENCODEKEY256)
        Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
                                     Chain.getValue(1));

      MachineSDNode *Res = CurDAG->getMachineNode(
          Opcode, dl, Node->getVTList(),
          {Node->getOperand(2), Chain, Chain.getValue(1)});
      ReplaceNode(Node, Res);
      return;
    }
    case Intrinsic::x86_tileloadd64_internal:
    case Intrinsic::x86_tileloaddt164_internal: {
      if (!Subtarget->hasAMXTILE())
        break;
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
      unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal ?
                         X86::PTILELOADDV : X86::PTILELOADDT1V;
      // _tile_loadd_internal(row, col, buf, STRIDE)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse3_monitor:
    case Intrinsic::x86_monitorx:
    case Intrinsic::x86_clzero: {
      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;

      unsigned Opc = 0;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_sse3_monitor:
        if (!Subtarget->hasSSE3())
          break;
        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
        break;
      case Intrinsic::x86_monitorx:
        if (!Subtarget->hasMWAITX())
          break;
        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
        break;
      case Intrinsic::x86_clzero:
        if (!Subtarget->hasCLZERO())
          break;
        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
        break;
      }

      if (Opc) {
        unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
                                             Node->getOperand(2), SDValue());
        SDValue InGlue = Chain.getValue(1);

        if (IntNo == Intrinsic::x86_sse3_monitor ||
            IntNo == Intrinsic::x86_monitorx) {
          // Copy the other two operands to ECX and EDX.
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
                                       InGlue);
          InGlue = Chain.getValue(1);
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
                                       InGlue);
          InGlue = Chain.getValue(1);
        }

        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
                                                      {Chain, InGlue});
        ReplaceNode(Node, CNode);
        return;
      }

      break;
    }
    case Intrinsic::x86_tilestored64_internal: {
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
      unsigned Opc = X86::PTILESTOREDV;
      // _tile_stored_internal(row, col, buf, STRIDE, c)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Node->getOperand(6),
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_tileloadd64:
    case Intrinsic::x86_tileloaddt164:
    case Intrinsic::x86_tilestored64: {
      if (!Subtarget->hasAMXTILE())
        break;
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
      unsigned Opc;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
      case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
      case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
      }
      // FIXME: Match displacement and scale.
      unsigned TIndex = Node->getConstantOperandVal(2);
      SDValue TReg = getI8Imm(TIndex, dl);
      SDValue Base = Node->getOperand(3);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(4);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      if (Opc == X86::PTILESTORED) {
        SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      } else {
        SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      }
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
  case ISD::BRIND:
  case X86ISD::NT_BRIND: {
    if (Subtarget->isTargetNaCl())
      // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
      // leave the instruction alone.
      break;
    if (Subtarget->isTarget64BitILP32()) {
      // Converts a 32-bit register to a 64-bit, zero-extended version of
      // it. This is needed because x86-64 can do many things, but jmp %r32
      // ain't one of them.
      SDValue Target = Node->getOperand(1);
      assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
      SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
                                      Node->getOperand(0), ZextTarget);
      ReplaceNode(Node, Brind.getNode());
      SelectCode(ZextTarget.getNode());
      SelectCode(Brind.getNode());
      return;
    }
    break;
  }
  case X86ISD::GlobalBaseReg:
    ReplaceNode(Node, getGlobalBaseReg());
    return;

  case ISD::BITCAST:
    // Just drop all 128/256/512-bit bitcasts.
    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
        NVT == MVT::f128) {
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case X86ISD::VPTERNLOG: {
    uint8_t Imm = Node->getConstantOperandVal(3);
    if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
                       Node->getOperand(1), Node->getOperand(2), Imm))
      return;
    break;
  }

  case X86ISD::ANDNP:
    if (tryVPTERNLOG(Node))
      return;
    break;
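  // Note: for i1-vector ANDs below we first try to fold one operand in as the
  // write-mask of a VPTESTM; e.g. (and M, (setcc (and X, Y), 0, ne)) can
  // become a single masked VPTESTM with M as the incoming k-register mask.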
  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    [[fallthrough]];
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;
    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;
    if (tryVPTERNLOG(Node))
      return;

    [[fallthrough]];
  case ISD::ADD:
    if (Opcode == ISD::ADD && matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SUB: {
    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
    // check immediate use count without making the patterns unavailable to
    // the fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    auto *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
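    // For example, with optsize, if the same 32-bit constant feeds several
    // adds, folding it would replicate a 4-byte imm32 in every instruction;
    // selecting the register form here keeps the constant in a register so it
    // is only materialized once. (Rough size argument; exact byte counts
    // depend on the encodings chosen.)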
    unsigned ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::i8:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
        break;
      }
      break;
    case MVT::i16:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
        break;
      }
      break;
    case MVT::i32:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
        break;
      }
      break;
    case MVT::i64:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
        break;
      }
      break;
    }

    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.

    // If this is not a subtract, we can still try to fold a load.
    if (Opcode != ISD::SUB) {
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        // Update the chain.
        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
    return;
  }

  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    [[fallthrough]];
  case X86ISD::UMUL: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    unsigned LoReg, HiReg;
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
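    // BMI2 MULX multiplies by the implicit EDX/RDX source, writes two
    // explicit destinations and leaves EFLAGS untouched, which is why it is
    // preferred for unsigned multiplies here. The MULX32Hrr/MULX64Hrr forms
    // are used when only the high half is needed, so no register has to be
    // allocated for the dead low result.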
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc = UseMULXHi  ? X86::MULX32Hrr
            : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
            : IsSigned ? X86::IMUL32r
                       : X86::MUL32r;
      MOpc = UseMULXHi  ? X86::MULX32Hrm
             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
             : IsSigned ? X86::IMUL32m
                        : X86::MUL32m;
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi  ? X86::MULX64Hrr
            : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
            : IsSigned ? X86::IMUL64r
                       : X86::MUL64r;
      MOpc = UseMULXHi  ? X86::MULX64Hrm
             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
             : IsSigned ? X86::IMUL64m
                        : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InGlue = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        InGlue = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                       NVT, InGlue);
        InGlue = ResLo.getValue(2);
      }
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                       NVT, InGlue);
        InGlue = ResHi.getValue(2);
      }
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned ROpc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL; ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX; HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
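    // The divide instructions expect the dividend in AH:AL / DX:AX / EDX:EAX /
    // RDX:RAX. Below, the high half is either sign-extended from the low half
    // with CWD/CDQ/CQO (signed division) or zeroed with MOV32r0; when the sign
    // bit of the dividend is known to be zero, the cheaper zeroing form is
    // safe even for signed division. i8 is special-cased: a single MOVSX/MOVZX
    // into AX sets up both halves at once.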
    SDValue InGlue;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InGlue = Chain.getValue(1);
    } else {
      InGlue =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                               LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InGlue =
            SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue ClrNode = SDValue(
            CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InGlue).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      MachineSDNode *CNode =
          CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InGlue = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InGlue =
          SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
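    // Background: an instruction that carries a REX prefix cannot encode AH,
    // so AH cannot be copied directly into registers that require REX
    // (SIL, DIL, R8B and up). The _NOREX extends below move AH into a full
    // 32-bit register first and then extract the low byte, which the register
    // allocator can place anywhere.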
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InGlue);
      SDValue Result(RNode, 0);
      InGlue = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InGlue);
      InGlue = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InGlue);
      InGlue = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;

    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }
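    // Without CMOV/FCOMI we fall back to the classic x87 sequence: FCOM/FUCOM
    // sets the condition codes in the FPU status word, FNSTSW stores FPSW to
    // AX, and SAHF copies AH into EFLAGS so ordinary SETcc/Jcc users of this
    // compare keep working. That is why canUseLAHFSAHF() is asserted below.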
    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Glue = Chain.getValue(1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
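      // For example, "(X & 0x00FFFFFF) == 0" only has leading zeros, so it
      // can become "shlq $40, X" followed by a register TEST; a mask such as
      // 0xFF00000000 that is 8 bits wide can become "shrq $32, X" plus a
      // "testb" on the extracted low byte, avoiding a movabsq of the mask.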
      if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE;
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Mask);
        unsigned TrailingZeros = llvm::countr_zero(Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.
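      // For instance, shrinking "testl $0x80, %eax" to "testb $0x80, %al"
      // moves the mask's top bit into the sign-bit position of the narrower
      // test, so SF would be computed differently than in the original 32-bit
      // compare; hence the (Mask & 0x80) / hasNoSignFlagUses checks below.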
      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
      unsigned MOpc = Subtarget->hasAVX() ?
                          X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InGlue).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;

  case X86ISD::SETCC_CARRY: {
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }
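    // SETB_C32r/SETB_C64r expand to "sbb %reg, %reg", which materializes 0 or
    // -1 from the carry flag in a single instruction. Because that sbb reads
    // its register operand, CPUs that do not treat the idiom as dependency
    // breaking would stall on the old register value, so for those targets
    // getSBBZero() zeroes the inputs explicitly instead.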
    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ?
                     X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ?
                   X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable        ??
  case InlineAsm::ConstraintCode::v: // not offsetable    ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}

/// This pass converts a legalized DAG into a X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}