Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
35266 views
//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7/// \file8/// This file implements a TargetTransformInfo analysis pass specific to the9/// X86 target machine. It uses the target's detailed information to provide10/// more precise answers to certain TTI queries, while letting the target11/// independent and default TTI implementations handle the rest.12///13//===----------------------------------------------------------------------===//1415#include "X86TargetTransformInfo.h"16#include "llvm/IR/IntrinsicInst.h"17#include "llvm/IR/IntrinsicsX86.h"18#include "llvm/Support/KnownBits.h"19#include "llvm/Transforms/InstCombine/InstCombiner.h"20#include <optional>2122using namespace llvm;23using namespace llvm::PatternMatch;2425#define DEBUG_TYPE "x86tti"2627/// Return a constant boolean vector that has true elements in all positions28/// where the input constant data vector has an element with the sign bit set.29static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {30VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));31V = ConstantExpr::getBitCast(V, IntTy);32V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,33Constant::getNullValue(IntTy), V, DL);34assert(V && "Vector must be foldable");35return V;36}3738/// Convert the x86 XMM integer vector mask to a vector of bools based on39/// each element's most significant bit (the sign bit).40static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {41// Fold Constant Mask.42if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))43return getNegativeIsTrueBoolVec(ConstantMask, DL);4445// Mask was extended from a boolean vector.46Value *ExtMask;47if (match(Mask, m_SExt(m_Value(ExtMask))) &&48ExtMask->getType()->isIntOrIntVectorTy(1))49return ExtMask;5051return nullptr;52}5354// TODO: If the x86 backend knew how to convert a bool vector mask back to an55// XMM register mask efficiently, we could transform all x86 masked intrinsics56// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.57static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {58Value *Ptr = II.getOperand(0);59Value *Mask = II.getOperand(1);60Constant *ZeroVec = Constant::getNullValue(II.getType());6162// Zero Mask - masked load instruction creates a zero vector.63if (isa<ConstantAggregateZero>(Mask))64return IC.replaceInstUsesWith(II, ZeroVec);6566// The mask is constant or extended from a bool vector. 
Convert this x8667// intrinsic to the LLVM intrinsic to allow target-independent optimizations.68if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {69// First, cast the x86 intrinsic scalar pointer to a vector pointer to match70// the LLVM intrinsic definition for the pointer argument.71unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();72PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);73Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");7475// The pass-through vector for an x86 masked load is a zero vector.76CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(77II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);78return IC.replaceInstUsesWith(II, NewMaskedLoad);79}8081return nullptr;82}8384// TODO: If the x86 backend knew how to convert a bool vector mask back to an85// XMM register mask efficiently, we could transform all x86 masked intrinsics86// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.87static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {88Value *Ptr = II.getOperand(0);89Value *Mask = II.getOperand(1);90Value *Vec = II.getOperand(2);9192// Zero Mask - this masked store instruction does nothing.93if (isa<ConstantAggregateZero>(Mask)) {94IC.eraseInstFromFunction(II);95return true;96}9798// The SSE2 version is too weird (eg, unaligned but non-temporal) to do99// anything else at this level.100if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)101return false;102103// The mask is constant or extended from a bool vector. Convert this x86104// intrinsic to the LLVM intrinsic to allow target-independent optimizations.105if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {106unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();107PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);108Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");109110IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);111112// 'Replace uses' doesn't work for stores. 
Erase the original masked store.113IC.eraseInstFromFunction(II);114return true;115}116117return false;118}119120static Value *simplifyX86immShift(const IntrinsicInst &II,121InstCombiner::BuilderTy &Builder) {122bool LogicalShift = false;123bool ShiftLeft = false;124bool IsImm = false;125126switch (II.getIntrinsicID()) {127default:128llvm_unreachable("Unexpected intrinsic!");129case Intrinsic::x86_sse2_psrai_d:130case Intrinsic::x86_sse2_psrai_w:131case Intrinsic::x86_avx2_psrai_d:132case Intrinsic::x86_avx2_psrai_w:133case Intrinsic::x86_avx512_psrai_q_128:134case Intrinsic::x86_avx512_psrai_q_256:135case Intrinsic::x86_avx512_psrai_d_512:136case Intrinsic::x86_avx512_psrai_q_512:137case Intrinsic::x86_avx512_psrai_w_512:138IsImm = true;139[[fallthrough]];140case Intrinsic::x86_sse2_psra_d:141case Intrinsic::x86_sse2_psra_w:142case Intrinsic::x86_avx2_psra_d:143case Intrinsic::x86_avx2_psra_w:144case Intrinsic::x86_avx512_psra_q_128:145case Intrinsic::x86_avx512_psra_q_256:146case Intrinsic::x86_avx512_psra_d_512:147case Intrinsic::x86_avx512_psra_q_512:148case Intrinsic::x86_avx512_psra_w_512:149LogicalShift = false;150ShiftLeft = false;151break;152case Intrinsic::x86_sse2_psrli_d:153case Intrinsic::x86_sse2_psrli_q:154case Intrinsic::x86_sse2_psrli_w:155case Intrinsic::x86_avx2_psrli_d:156case Intrinsic::x86_avx2_psrli_q:157case Intrinsic::x86_avx2_psrli_w:158case Intrinsic::x86_avx512_psrli_d_512:159case Intrinsic::x86_avx512_psrli_q_512:160case Intrinsic::x86_avx512_psrli_w_512:161IsImm = true;162[[fallthrough]];163case Intrinsic::x86_sse2_psrl_d:164case Intrinsic::x86_sse2_psrl_q:165case Intrinsic::x86_sse2_psrl_w:166case Intrinsic::x86_avx2_psrl_d:167case Intrinsic::x86_avx2_psrl_q:168case Intrinsic::x86_avx2_psrl_w:169case Intrinsic::x86_avx512_psrl_d_512:170case Intrinsic::x86_avx512_psrl_q_512:171case Intrinsic::x86_avx512_psrl_w_512:172LogicalShift = true;173ShiftLeft = false;174break;175case Intrinsic::x86_sse2_pslli_d:176case Intrinsic::x86_sse2_pslli_q:177case Intrinsic::x86_sse2_pslli_w:178case Intrinsic::x86_avx2_pslli_d:179case Intrinsic::x86_avx2_pslli_q:180case Intrinsic::x86_avx2_pslli_w:181case Intrinsic::x86_avx512_pslli_d_512:182case Intrinsic::x86_avx512_pslli_q_512:183case Intrinsic::x86_avx512_pslli_w_512:184IsImm = true;185[[fallthrough]];186case Intrinsic::x86_sse2_psll_d:187case Intrinsic::x86_sse2_psll_q:188case Intrinsic::x86_sse2_psll_w:189case Intrinsic::x86_avx2_psll_d:190case Intrinsic::x86_avx2_psll_q:191case Intrinsic::x86_avx2_psll_w:192case Intrinsic::x86_avx512_psll_d_512:193case Intrinsic::x86_avx512_psll_q_512:194case Intrinsic::x86_avx512_psll_w_512:195LogicalShift = true;196ShiftLeft = true;197break;198}199assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");200201Value *Vec = II.getArgOperand(0);202Value *Amt = II.getArgOperand(1);203auto *VT = cast<FixedVectorType>(Vec->getType());204Type *SVT = VT->getElementType();205Type *AmtVT = Amt->getType();206unsigned VWidth = VT->getNumElements();207unsigned BitWidth = SVT->getPrimitiveSizeInBits();208209// If the shift amount is guaranteed to be in-range we can replace it with a210// generic shift. 
If its guaranteed to be out of range, logical shifts combine211// to zero and arithmetic shifts are clamped to (BitWidth - 1).212if (IsImm) {213assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");214KnownBits KnownAmtBits =215llvm::computeKnownBits(Amt, II.getDataLayout());216if (KnownAmtBits.getMaxValue().ult(BitWidth)) {217Amt = Builder.CreateZExtOrTrunc(Amt, SVT);218Amt = Builder.CreateVectorSplat(VWidth, Amt);219return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)220: Builder.CreateLShr(Vec, Amt))221: Builder.CreateAShr(Vec, Amt));222}223if (KnownAmtBits.getMinValue().uge(BitWidth)) {224if (LogicalShift)225return ConstantAggregateZero::get(VT);226Amt = ConstantInt::get(SVT, BitWidth - 1);227return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));228}229} else {230// Ensure the first element has an in-range value and the rest of the231// elements in the bottom 64 bits are zero.232assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&233cast<VectorType>(AmtVT)->getElementType() == SVT &&234"Unexpected shift-by-scalar type");235unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();236APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);237APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);238KnownBits KnownLowerBits = llvm::computeKnownBits(239Amt, DemandedLower, II.getDataLayout());240KnownBits KnownUpperBits = llvm::computeKnownBits(241Amt, DemandedUpper, II.getDataLayout());242if (KnownLowerBits.getMaxValue().ult(BitWidth) &&243(DemandedUpper.isZero() || KnownUpperBits.isZero())) {244SmallVector<int, 16> ZeroSplat(VWidth, 0);245Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);246return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)247: Builder.CreateLShr(Vec, Amt))248: Builder.CreateAShr(Vec, Amt));249}250}251252// Simplify if count is constant vector.253auto *CDV = dyn_cast<ConstantDataVector>(Amt);254if (!CDV)255return nullptr;256257// SSE2/AVX2 uses all the first 64-bits of the 128-bit vector258// operand to compute the shift amount.259assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&260cast<VectorType>(AmtVT)->getElementType() == SVT &&261"Unexpected shift-by-scalar type");262263// Concatenate the sub-elements to create the 64-bit value.264APInt Count(64, 0);265for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {266unsigned SubEltIdx = (NumSubElts - 1) - i;267auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));268Count <<= BitWidth;269Count |= SubElt->getValue().zextOrTrunc(64);270}271272// If shift-by-zero then just return the original value.273if (Count.isZero())274return Vec;275276// Handle cases when Shift >= BitWidth.277if (Count.uge(BitWidth)) {278// If LogicalShift - just return zero.279if (LogicalShift)280return ConstantAggregateZero::get(VT);281282// If ArithmeticShift - clamp Shift to (BitWidth - 1).283Count = APInt(64, BitWidth - 1);284}285286// Get a constant vector of the same type as the first operand.287auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));288auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);289290if (ShiftLeft)291return Builder.CreateShl(Vec, ShiftVec);292293if (LogicalShift)294return Builder.CreateLShr(Vec, ShiftVec);295296return Builder.CreateAShr(Vec, ShiftVec);297}298299// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.300// Unlike the generic IR shifts, the intrinsics have defined behaviour for out301// of range shift 
amounts (logical - set to zero, arithmetic - splat sign bit).302static Value *simplifyX86varShift(const IntrinsicInst &II,303InstCombiner::BuilderTy &Builder) {304bool LogicalShift = false;305bool ShiftLeft = false;306307switch (II.getIntrinsicID()) {308default:309llvm_unreachable("Unexpected intrinsic!");310case Intrinsic::x86_avx2_psrav_d:311case Intrinsic::x86_avx2_psrav_d_256:312case Intrinsic::x86_avx512_psrav_q_128:313case Intrinsic::x86_avx512_psrav_q_256:314case Intrinsic::x86_avx512_psrav_d_512:315case Intrinsic::x86_avx512_psrav_q_512:316case Intrinsic::x86_avx512_psrav_w_128:317case Intrinsic::x86_avx512_psrav_w_256:318case Intrinsic::x86_avx512_psrav_w_512:319LogicalShift = false;320ShiftLeft = false;321break;322case Intrinsic::x86_avx2_psrlv_d:323case Intrinsic::x86_avx2_psrlv_d_256:324case Intrinsic::x86_avx2_psrlv_q:325case Intrinsic::x86_avx2_psrlv_q_256:326case Intrinsic::x86_avx512_psrlv_d_512:327case Intrinsic::x86_avx512_psrlv_q_512:328case Intrinsic::x86_avx512_psrlv_w_128:329case Intrinsic::x86_avx512_psrlv_w_256:330case Intrinsic::x86_avx512_psrlv_w_512:331LogicalShift = true;332ShiftLeft = false;333break;334case Intrinsic::x86_avx2_psllv_d:335case Intrinsic::x86_avx2_psllv_d_256:336case Intrinsic::x86_avx2_psllv_q:337case Intrinsic::x86_avx2_psllv_q_256:338case Intrinsic::x86_avx512_psllv_d_512:339case Intrinsic::x86_avx512_psllv_q_512:340case Intrinsic::x86_avx512_psllv_w_128:341case Intrinsic::x86_avx512_psllv_w_256:342case Intrinsic::x86_avx512_psllv_w_512:343LogicalShift = true;344ShiftLeft = true;345break;346}347assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");348349Value *Vec = II.getArgOperand(0);350Value *Amt = II.getArgOperand(1);351auto *VT = cast<FixedVectorType>(II.getType());352Type *SVT = VT->getElementType();353int NumElts = VT->getNumElements();354int BitWidth = SVT->getIntegerBitWidth();355356// If the shift amount is guaranteed to be in-range we can replace it with a357// generic shift.358KnownBits KnownAmt =359llvm::computeKnownBits(Amt, II.getDataLayout());360if (KnownAmt.getMaxValue().ult(BitWidth)) {361return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)362: Builder.CreateLShr(Vec, Amt))363: Builder.CreateAShr(Vec, Amt));364}365366// Simplify if all shift amounts are constant/undef.367auto *CShift = dyn_cast<Constant>(Amt);368if (!CShift)369return nullptr;370371// Collect each element's shift amount.372// We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.373bool AnyOutOfRange = false;374SmallVector<int, 8> ShiftAmts;375for (int I = 0; I < NumElts; ++I) {376auto *CElt = CShift->getAggregateElement(I);377if (isa_and_nonnull<UndefValue>(CElt)) {378ShiftAmts.push_back(-1);379continue;380}381382auto *COp = dyn_cast_or_null<ConstantInt>(CElt);383if (!COp)384return nullptr;385386// Handle out of range shifts.387// If LogicalShift - set to BitWidth (special case).388// If ArithmeticShift - set to (BitWidth - 1) (sign splat).389APInt ShiftVal = COp->getValue();390if (ShiftVal.uge(BitWidth)) {391AnyOutOfRange = LogicalShift;392ShiftAmts.push_back(LogicalShift ? 
BitWidth : BitWidth - 1);393continue;394}395396ShiftAmts.push_back((int)ShiftVal.getZExtValue());397}398399// If all elements out of range or UNDEF, return vector of zeros/undefs.400// ArithmeticShift should only hit this if they are all UNDEF.401auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };402if (llvm::all_of(ShiftAmts, OutOfRange)) {403SmallVector<Constant *, 8> ConstantVec;404for (int Idx : ShiftAmts) {405if (Idx < 0) {406ConstantVec.push_back(UndefValue::get(SVT));407} else {408assert(LogicalShift && "Logical shift expected");409ConstantVec.push_back(ConstantInt::getNullValue(SVT));410}411}412return ConstantVector::get(ConstantVec);413}414415// We can't handle only some out of range values with generic logical shifts.416if (AnyOutOfRange)417return nullptr;418419// Build the shift amount constant vector.420SmallVector<Constant *, 8> ShiftVecAmts;421for (int Idx : ShiftAmts) {422if (Idx < 0)423ShiftVecAmts.push_back(UndefValue::get(SVT));424else425ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));426}427auto ShiftVec = ConstantVector::get(ShiftVecAmts);428429if (ShiftLeft)430return Builder.CreateShl(Vec, ShiftVec);431432if (LogicalShift)433return Builder.CreateLShr(Vec, ShiftVec);434435return Builder.CreateAShr(Vec, ShiftVec);436}437438static Value *simplifyX86pack(IntrinsicInst &II,439InstCombiner::BuilderTy &Builder, bool IsSigned) {440Value *Arg0 = II.getArgOperand(0);441Value *Arg1 = II.getArgOperand(1);442Type *ResTy = II.getType();443444// Fast all undef handling.445if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))446return UndefValue::get(ResTy);447448auto *ArgTy = cast<FixedVectorType>(Arg0->getType());449unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;450unsigned NumSrcElts = ArgTy->getNumElements();451assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&452"Unexpected packing types");453454unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;455unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();456unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();457assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&458"Unexpected packing types");459460// Constant folding.461if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))462return nullptr;463464// Clamp Values - signed/unsigned both use signed clamp values, but they465// differ on the min/max values.466APInt MinValue, MaxValue;467if (IsSigned) {468// PACKSS: Truncate signed value with signed saturation.469// Source values less than dst minint are saturated to minint.470// Source values greater than dst maxint are saturated to maxint.471MinValue =472APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);473MaxValue =474APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);475} else {476// PACKUS: Truncate signed value with unsigned saturation.477// Source values less than zero are saturated to zero.478// Source values greater than dst maxuint are saturated to maxuint.479MinValue = APInt::getZero(SrcScalarSizeInBits);480MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);481}482483auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);484auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);485Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);486Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);487Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);488Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, 
Arg1);489490// Shuffle clamped args together at the lane level.491SmallVector<int, 32> PackMask;492for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {493for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)494PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));495for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)496PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);497}498auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);499500// Truncate to dst size.501return Builder.CreateTrunc(Shuffle, ResTy);502}503504static Value *simplifyX86pmulh(IntrinsicInst &II,505InstCombiner::BuilderTy &Builder, bool IsSigned,506bool IsRounding) {507Value *Arg0 = II.getArgOperand(0);508Value *Arg1 = II.getArgOperand(1);509auto *ResTy = cast<FixedVectorType>(II.getType());510auto *ArgTy = cast<FixedVectorType>(Arg0->getType());511assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&512"Unexpected PMULH types");513assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");514515// Multiply by undef -> zero (NOT undef!) as other arg could still be zero.516if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))517return ConstantAggregateZero::get(ResTy);518519// Multiply by zero.520if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))521return ConstantAggregateZero::get(ResTy);522523// Multiply by one.524if (!IsRounding) {525if (match(Arg0, m_One()))526return IsSigned ? Builder.CreateAShr(Arg1, 15)527: ConstantAggregateZero::get(ResTy);528if (match(Arg1, m_One()))529return IsSigned ? Builder.CreateAShr(Arg0, 15)530: ConstantAggregateZero::get(ResTy);531}532533// Constant folding.534if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))535return nullptr;536537// Extend to twice the width and multiply.538auto Cast =539IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;540auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);541Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);542Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);543Value *Mul = Builder.CreateMul(LHS, RHS);544545if (IsRounding) {546// PMULHRSW: truncate to vXi18 of the most significant bits, add one and547// extract bits[16:1].548auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);549auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);550Mul = Builder.CreateLShr(Mul, 14);551Mul = Builder.CreateTrunc(Mul, RndTy);552Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));553Mul = Builder.CreateLShr(Mul, 1);554} else {555// PMULH/PMULHU: extract the vXi16 most significant bits.556Mul = Builder.CreateLShr(Mul, 16);557}558559return Builder.CreateTrunc(Mul, ResTy);560}561562static Value *simplifyX86pmadd(IntrinsicInst &II,563InstCombiner::BuilderTy &Builder,564bool IsPMADDWD) {565Value *Arg0 = II.getArgOperand(0);566Value *Arg1 = II.getArgOperand(1);567auto *ResTy = cast<FixedVectorType>(II.getType());568[[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());569570unsigned NumDstElts = ResTy->getNumElements();571assert(ArgTy->getNumElements() == (2 * NumDstElts) &&572ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&573"Unexpected PMADD types");574575// Multiply by undef -> zero (NOT undef!) 
as other arg could still be zero.576if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))577return ConstantAggregateZero::get(ResTy);578579// Multiply by zero.580if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))581return ConstantAggregateZero::get(ResTy);582583// Constant folding.584if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))585return nullptr;586587// Split Lo/Hi elements pairs, extend and add together.588// PMADDWD(X,Y) =589// add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))590// PMADDUBSW(X,Y) =591// sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))592SmallVector<int> LoMask, HiMask;593for (unsigned I = 0; I != NumDstElts; ++I) {594LoMask.push_back(2 * I + 0);595HiMask.push_back(2 * I + 1);596}597598auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);599auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);600auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);601auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);602603auto LHSCast =604IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;605LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);606LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);607RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);608RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);609Value *Lo = Builder.CreateMul(LHSLo, RHSLo);610Value *Hi = Builder.CreateMul(LHSHi, RHSHi);611return IsPMADDWD612? Builder.CreateAdd(Lo, Hi)613: Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});614}615616static Value *simplifyX86movmsk(const IntrinsicInst &II,617InstCombiner::BuilderTy &Builder) {618Value *Arg = II.getArgOperand(0);619Type *ResTy = II.getType();620621// movmsk(undef) -> zero as we must ensure the upper bits are zero.622if (isa<UndefValue>(Arg))623return Constant::getNullValue(ResTy);624625auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());626// We can't easily peek through x86_mmx types.627if (!ArgTy)628return nullptr;629630// Expand MOVMSK to compare/bitcast/zext:631// e.g. 
PMOVMSKB(v16i8 x):632// %cmp = icmp slt <16 x i8> %x, zeroinitializer633// %int = bitcast <16 x i1> %cmp to i16634// %res = zext i16 %int to i32635unsigned NumElts = ArgTy->getNumElements();636Type *IntegerTy = Builder.getIntNTy(NumElts);637638Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));639Res = Builder.CreateIsNeg(Res);640Res = Builder.CreateBitCast(Res, IntegerTy);641Res = Builder.CreateZExtOrTrunc(Res, ResTy);642return Res;643}644645static Value *simplifyX86addcarry(const IntrinsicInst &II,646InstCombiner::BuilderTy &Builder) {647Value *CarryIn = II.getArgOperand(0);648Value *Op1 = II.getArgOperand(1);649Value *Op2 = II.getArgOperand(2);650Type *RetTy = II.getType();651Type *OpTy = Op1->getType();652assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&653RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&654"Unexpected types for x86 addcarry");655656// If carry-in is zero, this is just an unsigned add with overflow.657if (match(CarryIn, m_ZeroInt())) {658Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,659{Op1, Op2});660// The types have to be adjusted to match the x86 call types.661Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);662Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),663Builder.getInt8Ty());664Value *Res = PoisonValue::get(RetTy);665Res = Builder.CreateInsertValue(Res, UAddOV, 0);666return Builder.CreateInsertValue(Res, UAddResult, 1);667}668669return nullptr;670}671672static Value *simplifyTernarylogic(const IntrinsicInst &II,673InstCombiner::BuilderTy &Builder) {674675auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));676if (!ArgImm || ArgImm->getValue().uge(256))677return nullptr;678679Value *ArgA = II.getArgOperand(0);680Value *ArgB = II.getArgOperand(1);681Value *ArgC = II.getArgOperand(2);682683Type *Ty = II.getType();684685auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {686return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};687};688auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {689return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};690};691auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {692return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};693};694auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {695return {Builder.CreateNot(V.first), ~V.second};696};697auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };698auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };699auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };700701bool AIsConst = match(ArgA, m_ImmConstant());702bool BIsConst = match(ArgB, m_ImmConstant());703bool CIsConst = match(ArgC, m_ImmConstant());704705bool ABIsConst = AIsConst && BIsConst;706bool ACIsConst = AIsConst && CIsConst;707bool BCIsConst = BIsConst && CIsConst;708bool ABCIsConst = AIsConst && BIsConst && CIsConst;709710// Use for verification. Its a big table. Its difficult to go from Imm ->711// logic ops, but easy to verify that a set of logic ops is correct. We track712// the logic ops through the second value in the pair. 
At the end it should713// equal Imm.714std::pair<Value *, uint8_t> A = {ArgA, 0xf0};715std::pair<Value *, uint8_t> B = {ArgB, 0xcc};716std::pair<Value *, uint8_t> C = {ArgC, 0xaa};717std::pair<Value *, uint8_t> Res = {nullptr, 0};718719// Currently we only handle cases that convert directly to another instruction720// or cases where all the ops are constant. This is because we don't properly721// handle creating ternary ops in the backend, so splitting them here may722// cause regressions. As the backend improves, uncomment more cases.723724uint8_t Imm = ArgImm->getValue().getZExtValue();725switch (Imm) {726case 0x0:727Res = {Constant::getNullValue(Ty), 0};728break;729case 0x1:730if (ABCIsConst)731Res = Nor(Or(A, B), C);732break;733case 0x2:734if (ABCIsConst)735Res = And(Nor(A, B), C);736break;737case 0x3:738if (ABIsConst)739Res = Nor(A, B);740break;741case 0x4:742if (ABCIsConst)743Res = And(Nor(A, C), B);744break;745case 0x5:746if (ACIsConst)747Res = Nor(A, C);748break;749case 0x6:750if (ABCIsConst)751Res = Nor(A, Xnor(B, C));752break;753case 0x7:754if (ABCIsConst)755Res = Nor(A, And(B, C));756break;757case 0x8:758if (ABCIsConst)759Res = Nor(A, Nand(B, C));760break;761case 0x9:762if (ABCIsConst)763Res = Nor(A, Xor(B, C));764break;765case 0xa:766if (ACIsConst)767Res = Nor(A, Not(C));768break;769case 0xb:770if (ABCIsConst)771Res = Nor(A, Nor(C, Not(B)));772break;773case 0xc:774if (ABIsConst)775Res = Nor(A, Not(B));776break;777case 0xd:778if (ABCIsConst)779Res = Nor(A, Nor(B, Not(C)));780break;781case 0xe:782if (ABCIsConst)783Res = Nor(A, Nor(B, C));784break;785case 0xf:786Res = Not(A);787break;788case 0x10:789if (ABCIsConst)790Res = And(A, Nor(B, C));791break;792case 0x11:793if (BCIsConst)794Res = Nor(B, C);795break;796case 0x12:797if (ABCIsConst)798Res = Nor(Xnor(A, C), B);799break;800case 0x13:801if (ABCIsConst)802Res = Nor(And(A, C), B);803break;804case 0x14:805if (ABCIsConst)806Res = Nor(Xnor(A, B), C);807break;808case 0x15:809if (ABCIsConst)810Res = Nor(And(A, B), C);811break;812case 0x16:813if (ABCIsConst)814Res = Xor(Xor(A, B), And(Nand(A, B), C));815break;816case 0x17:817if (ABCIsConst)818Res = Xor(Or(A, B), Or(Xnor(A, B), C));819break;820case 0x18:821if (ABCIsConst)822Res = Nor(Xnor(A, B), Xnor(A, C));823break;824case 0x19:825if (ABCIsConst)826Res = And(Nand(A, B), Xnor(B, C));827break;828case 0x1a:829if (ABCIsConst)830Res = Xor(A, Or(And(A, B), C));831break;832case 0x1b:833if (ABCIsConst)834Res = Xor(A, Or(Xnor(A, B), C));835break;836case 0x1c:837if (ABCIsConst)838Res = Xor(A, Or(And(A, C), B));839break;840case 0x1d:841if (ABCIsConst)842Res = Xor(A, Or(Xnor(A, C), B));843break;844case 0x1e:845if (ABCIsConst)846Res = Xor(A, Or(B, C));847break;848case 0x1f:849if (ABCIsConst)850Res = Nand(A, Or(B, C));851break;852case 0x20:853if (ABCIsConst)854Res = Nor(Nand(A, C), B);855break;856case 0x21:857if (ABCIsConst)858Res = Nor(Xor(A, C), B);859break;860case 0x22:861if (BCIsConst)862Res = Nor(B, Not(C));863break;864case 0x23:865if (ABCIsConst)866Res = Nor(B, Nor(C, Not(A)));867break;868case 0x24:869if (ABCIsConst)870Res = Nor(Xnor(A, B), Xor(A, C));871break;872case 0x25:873if (ABCIsConst)874Res = Xor(A, Nand(Nand(A, B), C));875break;876case 0x26:877if (ABCIsConst)878Res = And(Nand(A, B), Xor(B, C));879break;880case 0x27:881if (ABCIsConst)882Res = Xor(Or(Xnor(A, B), C), B);883break;884case 0x28:885if (ABCIsConst)886Res = And(Xor(A, B), C);887break;888case 0x29:889if (ABCIsConst)890Res = Xor(Xor(A, B), Nor(And(A, B), C));891break;892case 0x2a:893if (ABCIsConst)894Res = And(Nand(A, B), 
C);895break;896case 0x2b:897if (ABCIsConst)898Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);899break;900case 0x2c:901if (ABCIsConst)902Res = Nor(Xnor(A, B), Nor(B, C));903break;904case 0x2d:905if (ABCIsConst)906Res = Xor(A, Or(B, Not(C)));907break;908case 0x2e:909if (ABCIsConst)910Res = Xor(A, Or(Xor(A, C), B));911break;912case 0x2f:913if (ABCIsConst)914Res = Nand(A, Or(B, Not(C)));915break;916case 0x30:917if (ABIsConst)918Res = Nor(B, Not(A));919break;920case 0x31:921if (ABCIsConst)922Res = Nor(Nor(A, Not(C)), B);923break;924case 0x32:925if (ABCIsConst)926Res = Nor(Nor(A, C), B);927break;928case 0x33:929Res = Not(B);930break;931case 0x34:932if (ABCIsConst)933Res = And(Xor(A, B), Nand(B, C));934break;935case 0x35:936if (ABCIsConst)937Res = Xor(B, Or(A, Xnor(B, C)));938break;939case 0x36:940if (ABCIsConst)941Res = Xor(Or(A, C), B);942break;943case 0x37:944if (ABCIsConst)945Res = Nand(Or(A, C), B);946break;947case 0x38:948if (ABCIsConst)949Res = Nor(Xnor(A, B), Nor(A, C));950break;951case 0x39:952if (ABCIsConst)953Res = Xor(Or(A, Not(C)), B);954break;955case 0x3a:956if (ABCIsConst)957Res = Xor(B, Or(A, Xor(B, C)));958break;959case 0x3b:960if (ABCIsConst)961Res = Nand(Or(A, Not(C)), B);962break;963case 0x3c:964Res = Xor(A, B);965break;966case 0x3d:967if (ABCIsConst)968Res = Xor(A, Or(Nor(A, C), B));969break;970case 0x3e:971if (ABCIsConst)972Res = Xor(A, Or(Nor(A, Not(C)), B));973break;974case 0x3f:975if (ABIsConst)976Res = Nand(A, B);977break;978case 0x40:979if (ABCIsConst)980Res = Nor(Nand(A, B), C);981break;982case 0x41:983if (ABCIsConst)984Res = Nor(Xor(A, B), C);985break;986case 0x42:987if (ABCIsConst)988Res = Nor(Xor(A, B), Xnor(A, C));989break;990case 0x43:991if (ABCIsConst)992Res = Xor(A, Nand(Nand(A, C), B));993break;994case 0x44:995if (BCIsConst)996Res = Nor(C, Not(B));997break;998case 0x45:999if (ABCIsConst)1000Res = Nor(Nor(B, Not(A)), C);1001break;1002case 0x46:1003if (ABCIsConst)1004Res = Xor(Or(And(A, C), B), C);1005break;1006case 0x47:1007if (ABCIsConst)1008Res = Xor(Or(Xnor(A, C), B), C);1009break;1010case 0x48:1011if (ABCIsConst)1012Res = And(Xor(A, C), B);1013break;1014case 0x49:1015if (ABCIsConst)1016Res = Xor(Or(Xnor(A, B), And(A, C)), C);1017break;1018case 0x4a:1019if (ABCIsConst)1020Res = Nor(Xnor(A, C), Nor(B, C));1021break;1022case 0x4b:1023if (ABCIsConst)1024Res = Xor(A, Or(C, Not(B)));1025break;1026case 0x4c:1027if (ABCIsConst)1028Res = And(Nand(A, C), B);1029break;1030case 0x4d:1031if (ABCIsConst)1032Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);1033break;1034case 0x4e:1035if (ABCIsConst)1036Res = Xor(A, Or(Xor(A, B), C));1037break;1038case 0x4f:1039if (ABCIsConst)1040Res = Nand(A, Nand(B, Not(C)));1041break;1042case 0x50:1043if (ACIsConst)1044Res = Nor(C, Not(A));1045break;1046case 0x51:1047if (ABCIsConst)1048Res = Nor(Nor(A, Not(B)), C);1049break;1050case 0x52:1051if (ABCIsConst)1052Res = And(Xor(A, C), Nand(B, C));1053break;1054case 0x53:1055if (ABCIsConst)1056Res = Xor(Or(Xnor(B, C), A), C);1057break;1058case 0x54:1059if (ABCIsConst)1060Res = Nor(Nor(A, B), C);1061break;1062case 0x55:1063Res = Not(C);1064break;1065case 0x56:1066if (ABCIsConst)1067Res = Xor(Or(A, B), C);1068break;1069case 0x57:1070if (ABCIsConst)1071Res = Nand(Or(A, B), C);1072break;1073case 0x58:1074if (ABCIsConst)1075Res = Nor(Nor(A, B), Xnor(A, C));1076break;1077case 0x59:1078if (ABCIsConst)1079Res = Xor(Or(A, Not(B)), C);1080break;1081case 0x5a:1082Res = Xor(A, C);1083break;1084case 0x5b:1085if (ABCIsConst)1086Res = Xor(A, Or(Nor(A, B), C));1087break;1088case 0x5c:1089if (ABCIsConst)1090Res = 
Xor(Or(Xor(B, C), A), C);1091break;1092case 0x5d:1093if (ABCIsConst)1094Res = Nand(Or(A, Not(B)), C);1095break;1096case 0x5e:1097if (ABCIsConst)1098Res = Xor(A, Or(Nor(A, Not(B)), C));1099break;1100case 0x5f:1101if (ACIsConst)1102Res = Nand(A, C);1103break;1104case 0x60:1105if (ABCIsConst)1106Res = And(A, Xor(B, C));1107break;1108case 0x61:1109if (ABCIsConst)1110Res = Xor(Or(Xnor(A, B), And(B, C)), C);1111break;1112case 0x62:1113if (ABCIsConst)1114Res = Nor(Nor(A, C), Xnor(B, C));1115break;1116case 0x63:1117if (ABCIsConst)1118Res = Xor(B, Or(C, Not(A)));1119break;1120case 0x64:1121if (ABCIsConst)1122Res = Nor(Nor(A, B), Xnor(B, C));1123break;1124case 0x65:1125if (ABCIsConst)1126Res = Xor(Or(B, Not(A)), C);1127break;1128case 0x66:1129Res = Xor(B, C);1130break;1131case 0x67:1132if (ABCIsConst)1133Res = Or(Nor(A, B), Xor(B, C));1134break;1135case 0x68:1136if (ABCIsConst)1137Res = Xor(Xor(A, B), Nor(Nor(A, B), C));1138break;1139case 0x69:1140if (ABCIsConst)1141Res = Xor(Xnor(A, B), C);1142break;1143case 0x6a:1144if (ABCIsConst)1145Res = Xor(And(A, B), C);1146break;1147case 0x6b:1148if (ABCIsConst)1149Res = Or(Nor(A, B), Xor(Xnor(A, B), C));1150break;1151case 0x6c:1152if (ABCIsConst)1153Res = Xor(And(A, C), B);1154break;1155case 0x6d:1156if (ABCIsConst)1157Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);1158break;1159case 0x6e:1160if (ABCIsConst)1161Res = Or(Nor(A, Not(B)), Xor(B, C));1162break;1163case 0x6f:1164if (ABCIsConst)1165Res = Nand(A, Xnor(B, C));1166break;1167case 0x70:1168if (ABCIsConst)1169Res = And(A, Nand(B, C));1170break;1171case 0x71:1172if (ABCIsConst)1173Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);1174break;1175case 0x72:1176if (ABCIsConst)1177Res = Xor(Or(Xor(A, B), C), B);1178break;1179case 0x73:1180if (ABCIsConst)1181Res = Nand(Nand(A, Not(C)), B);1182break;1183case 0x74:1184if (ABCIsConst)1185Res = Xor(Or(Xor(A, C), B), C);1186break;1187case 0x75:1188if (ABCIsConst)1189Res = Nand(Nand(A, Not(B)), C);1190break;1191case 0x76:1192if (ABCIsConst)1193Res = Xor(B, Or(Nor(B, Not(A)), C));1194break;1195case 0x77:1196if (BCIsConst)1197Res = Nand(B, C);1198break;1199case 0x78:1200if (ABCIsConst)1201Res = Xor(A, And(B, C));1202break;1203case 0x79:1204if (ABCIsConst)1205Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);1206break;1207case 0x7a:1208if (ABCIsConst)1209Res = Or(Xor(A, C), Nor(B, Not(A)));1210break;1211case 0x7b:1212if (ABCIsConst)1213Res = Nand(Xnor(A, C), B);1214break;1215case 0x7c:1216if (ABCIsConst)1217Res = Or(Xor(A, B), Nor(C, Not(A)));1218break;1219case 0x7d:1220if (ABCIsConst)1221Res = Nand(Xnor(A, B), C);1222break;1223case 0x7e:1224if (ABCIsConst)1225Res = Or(Xor(A, B), Xor(A, C));1226break;1227case 0x7f:1228if (ABCIsConst)1229Res = Nand(And(A, B), C);1230break;1231case 0x80:1232if (ABCIsConst)1233Res = And(And(A, B), C);1234break;1235case 0x81:1236if (ABCIsConst)1237Res = Nor(Xor(A, B), Xor(A, C));1238break;1239case 0x82:1240if (ABCIsConst)1241Res = And(Xnor(A, B), C);1242break;1243case 0x83:1244if (ABCIsConst)1245Res = Nor(Xor(A, B), Nor(C, Not(A)));1246break;1247case 0x84:1248if (ABCIsConst)1249Res = And(Xnor(A, C), B);1250break;1251case 0x85:1252if (ABCIsConst)1253Res = Nor(Xor(A, C), Nor(B, Not(A)));1254break;1255case 0x86:1256if (ABCIsConst)1257Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);1258break;1259case 0x87:1260if (ABCIsConst)1261Res = Xor(A, Nand(B, C));1262break;1263case 0x88:1264Res = And(B, C);1265break;1266case 0x89:1267if (ABCIsConst)1268Res = Xor(B, Nor(Nor(B, Not(A)), C));1269break;1270case 0x8a:1271if (ABCIsConst)1272Res = And(Nand(A, Not(B)), C);1273break;1274case 
0x8b:1275if (ABCIsConst)1276Res = Xor(Nor(Xor(A, C), B), C);1277break;1278case 0x8c:1279if (ABCIsConst)1280Res = And(Nand(A, Not(C)), B);1281break;1282case 0x8d:1283if (ABCIsConst)1284Res = Xor(Nor(Xor(A, B), C), B);1285break;1286case 0x8e:1287if (ABCIsConst)1288Res = Xor(Or(Xor(A, B), Xor(A, C)), A);1289break;1290case 0x8f:1291if (ABCIsConst)1292Res = Nand(A, Nand(B, C));1293break;1294case 0x90:1295if (ABCIsConst)1296Res = And(A, Xnor(B, C));1297break;1298case 0x91:1299if (ABCIsConst)1300Res = Nor(Nor(A, Not(B)), Xor(B, C));1301break;1302case 0x92:1303if (ABCIsConst)1304Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);1305break;1306case 0x93:1307if (ABCIsConst)1308Res = Xor(Nand(A, C), B);1309break;1310case 0x94:1311if (ABCIsConst)1312Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));1313break;1314case 0x95:1315if (ABCIsConst)1316Res = Xor(Nand(A, B), C);1317break;1318case 0x96:1319if (ABCIsConst)1320Res = Xor(Xor(A, B), C);1321break;1322case 0x97:1323if (ABCIsConst)1324Res = Xor(Xor(A, B), Or(Nor(A, B), C));1325break;1326case 0x98:1327if (ABCIsConst)1328Res = Nor(Nor(A, B), Xor(B, C));1329break;1330case 0x99:1331if (BCIsConst)1332Res = Xnor(B, C);1333break;1334case 0x9a:1335if (ABCIsConst)1336Res = Xor(Nor(B, Not(A)), C);1337break;1338case 0x9b:1339if (ABCIsConst)1340Res = Or(Nor(A, B), Xnor(B, C));1341break;1342case 0x9c:1343if (ABCIsConst)1344Res = Xor(B, Nor(C, Not(A)));1345break;1346case 0x9d:1347if (ABCIsConst)1348Res = Or(Nor(A, C), Xnor(B, C));1349break;1350case 0x9e:1351if (ABCIsConst)1352Res = Xor(And(Xor(A, B), Nand(B, C)), C);1353break;1354case 0x9f:1355if (ABCIsConst)1356Res = Nand(A, Xor(B, C));1357break;1358case 0xa0:1359Res = And(A, C);1360break;1361case 0xa1:1362if (ABCIsConst)1363Res = Xor(A, Nor(Nor(A, Not(B)), C));1364break;1365case 0xa2:1366if (ABCIsConst)1367Res = And(Or(A, Not(B)), C);1368break;1369case 0xa3:1370if (ABCIsConst)1371Res = Xor(Nor(Xor(B, C), A), C);1372break;1373case 0xa4:1374if (ABCIsConst)1375Res = Xor(A, Nor(Nor(A, B), C));1376break;1377case 0xa5:1378if (ACIsConst)1379Res = Xnor(A, C);1380break;1381case 0xa6:1382if (ABCIsConst)1383Res = Xor(Nor(A, Not(B)), C);1384break;1385case 0xa7:1386if (ABCIsConst)1387Res = Or(Nor(A, B), Xnor(A, C));1388break;1389case 0xa8:1390if (ABCIsConst)1391Res = And(Or(A, B), C);1392break;1393case 0xa9:1394if (ABCIsConst)1395Res = Xor(Nor(A, B), C);1396break;1397case 0xaa:1398Res = C;1399break;1400case 0xab:1401if (ABCIsConst)1402Res = Or(Nor(A, B), C);1403break;1404case 0xac:1405if (ABCIsConst)1406Res = Xor(Nor(Xnor(B, C), A), C);1407break;1408case 0xad:1409if (ABCIsConst)1410Res = Or(Xnor(A, C), And(B, C));1411break;1412case 0xae:1413if (ABCIsConst)1414Res = Or(Nor(A, Not(B)), C);1415break;1416case 0xaf:1417if (ACIsConst)1418Res = Or(C, Not(A));1419break;1420case 0xb0:1421if (ABCIsConst)1422Res = And(A, Nand(B, Not(C)));1423break;1424case 0xb1:1425if (ABCIsConst)1426Res = Xor(A, Nor(Xor(A, B), C));1427break;1428case 0xb2:1429if (ABCIsConst)1430Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);1431break;1432case 0xb3:1433if (ABCIsConst)1434Res = Nand(Nand(A, C), B);1435break;1436case 0xb4:1437if (ABCIsConst)1438Res = Xor(A, Nor(C, Not(B)));1439break;1440case 0xb5:1441if (ABCIsConst)1442Res = Or(Xnor(A, C), Nor(B, C));1443break;1444case 0xb6:1445if (ABCIsConst)1446Res = Xor(And(Xor(A, B), Nand(A, C)), C);1447break;1448case 0xb7:1449if (ABCIsConst)1450Res = Nand(Xor(A, C), B);1451break;1452case 0xb8:1453if (ABCIsConst)1454Res = Xor(Nor(Xnor(A, C), B), C);1455break;1456case 0xb9:1457if (ABCIsConst)1458Res = Xor(Nor(And(A, C), B), 
C);1459break;1460case 0xba:1461if (ABCIsConst)1462Res = Or(Nor(B, Not(A)), C);1463break;1464case 0xbb:1465if (BCIsConst)1466Res = Or(C, Not(B));1467break;1468case 0xbc:1469if (ABCIsConst)1470Res = Xor(A, And(Nand(A, C), B));1471break;1472case 0xbd:1473if (ABCIsConst)1474Res = Or(Xor(A, B), Xnor(A, C));1475break;1476case 0xbe:1477if (ABCIsConst)1478Res = Or(Xor(A, B), C);1479break;1480case 0xbf:1481if (ABCIsConst)1482Res = Or(Nand(A, B), C);1483break;1484case 0xc0:1485Res = And(A, B);1486break;1487case 0xc1:1488if (ABCIsConst)1489Res = Xor(A, Nor(Nor(A, Not(C)), B));1490break;1491case 0xc2:1492if (ABCIsConst)1493Res = Xor(A, Nor(Nor(A, C), B));1494break;1495case 0xc3:1496if (ABIsConst)1497Res = Xnor(A, B);1498break;1499case 0xc4:1500if (ABCIsConst)1501Res = And(Or(A, Not(C)), B);1502break;1503case 0xc5:1504if (ABCIsConst)1505Res = Xor(B, Nor(A, Xor(B, C)));1506break;1507case 0xc6:1508if (ABCIsConst)1509Res = Xor(Nor(A, Not(C)), B);1510break;1511case 0xc7:1512if (ABCIsConst)1513Res = Or(Xnor(A, B), Nor(A, C));1514break;1515case 0xc8:1516if (ABCIsConst)1517Res = And(Or(A, C), B);1518break;1519case 0xc9:1520if (ABCIsConst)1521Res = Xor(Nor(A, C), B);1522break;1523case 0xca:1524if (ABCIsConst)1525Res = Xor(B, Nor(A, Xnor(B, C)));1526break;1527case 0xcb:1528if (ABCIsConst)1529Res = Or(Xnor(A, B), And(B, C));1530break;1531case 0xcc:1532Res = B;1533break;1534case 0xcd:1535if (ABCIsConst)1536Res = Or(Nor(A, C), B);1537break;1538case 0xce:1539if (ABCIsConst)1540Res = Or(Nor(A, Not(C)), B);1541break;1542case 0xcf:1543if (ABIsConst)1544Res = Or(B, Not(A));1545break;1546case 0xd0:1547if (ABCIsConst)1548Res = And(A, Or(B, Not(C)));1549break;1550case 0xd1:1551if (ABCIsConst)1552Res = Xor(A, Nor(Xor(A, C), B));1553break;1554case 0xd2:1555if (ABCIsConst)1556Res = Xor(A, Nor(B, Not(C)));1557break;1558case 0xd3:1559if (ABCIsConst)1560Res = Or(Xnor(A, B), Nor(B, C));1561break;1562case 0xd4:1563if (ABCIsConst)1564Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);1565break;1566case 0xd5:1567if (ABCIsConst)1568Res = Nand(Nand(A, B), C);1569break;1570case 0xd6:1571if (ABCIsConst)1572Res = Xor(Xor(A, B), Or(And(A, B), C));1573break;1574case 0xd7:1575if (ABCIsConst)1576Res = Nand(Xor(A, B), C);1577break;1578case 0xd8:1579if (ABCIsConst)1580Res = Xor(Nor(Xnor(A, B), C), B);1581break;1582case 0xd9:1583if (ABCIsConst)1584Res = Or(And(A, B), Xnor(B, C));1585break;1586case 0xda:1587if (ABCIsConst)1588Res = Xor(A, And(Nand(A, B), C));1589break;1590case 0xdb:1591if (ABCIsConst)1592Res = Or(Xnor(A, B), Xor(A, C));1593break;1594case 0xdc:1595if (ABCIsConst)1596Res = Or(B, Nor(C, Not(A)));1597break;1598case 0xdd:1599if (BCIsConst)1600Res = Or(B, Not(C));1601break;1602case 0xde:1603if (ABCIsConst)1604Res = Or(Xor(A, C), B);1605break;1606case 0xdf:1607if (ABCIsConst)1608Res = Or(Nand(A, C), B);1609break;1610case 0xe0:1611if (ABCIsConst)1612Res = And(A, Or(B, C));1613break;1614case 0xe1:1615if (ABCIsConst)1616Res = Xor(A, Nor(B, C));1617break;1618case 0xe2:1619if (ABCIsConst)1620Res = Xor(A, Nor(Xnor(A, C), B));1621break;1622case 0xe3:1623if (ABCIsConst)1624Res = Xor(A, Nor(And(A, C), B));1625break;1626case 0xe4:1627if (ABCIsConst)1628Res = Xor(A, Nor(Xnor(A, B), C));1629break;1630case 0xe5:1631if (ABCIsConst)1632Res = Xor(A, Nor(And(A, B), C));1633break;1634case 0xe6:1635if (ABCIsConst)1636Res = Or(And(A, B), Xor(B, C));1637break;1638case 0xe7:1639if (ABCIsConst)1640Res = Or(Xnor(A, B), Xnor(A, C));1641break;1642case 0xe8:1643if (ABCIsConst)1644Res = Xor(Or(A, B), Nor(Xnor(A, B), C));1645break;1646case 0xe9:1647if (ABCIsConst)1648Res = 
Xor(Xor(A, B), Nand(Nand(A, B), C));1649break;1650case 0xea:1651if (ABCIsConst)1652Res = Or(And(A, B), C);1653break;1654case 0xeb:1655if (ABCIsConst)1656Res = Or(Xnor(A, B), C);1657break;1658case 0xec:1659if (ABCIsConst)1660Res = Or(And(A, C), B);1661break;1662case 0xed:1663if (ABCIsConst)1664Res = Or(Xnor(A, C), B);1665break;1666case 0xee:1667Res = Or(B, C);1668break;1669case 0xef:1670if (ABCIsConst)1671Res = Nand(A, Nor(B, C));1672break;1673case 0xf0:1674Res = A;1675break;1676case 0xf1:1677if (ABCIsConst)1678Res = Or(A, Nor(B, C));1679break;1680case 0xf2:1681if (ABCIsConst)1682Res = Or(A, Nor(B, Not(C)));1683break;1684case 0xf3:1685if (ABIsConst)1686Res = Or(A, Not(B));1687break;1688case 0xf4:1689if (ABCIsConst)1690Res = Or(A, Nor(C, Not(B)));1691break;1692case 0xf5:1693if (ACIsConst)1694Res = Or(A, Not(C));1695break;1696case 0xf6:1697if (ABCIsConst)1698Res = Or(A, Xor(B, C));1699break;1700case 0xf7:1701if (ABCIsConst)1702Res = Or(A, Nand(B, C));1703break;1704case 0xf8:1705if (ABCIsConst)1706Res = Or(A, And(B, C));1707break;1708case 0xf9:1709if (ABCIsConst)1710Res = Or(A, Xnor(B, C));1711break;1712case 0xfa:1713Res = Or(A, C);1714break;1715case 0xfb:1716if (ABCIsConst)1717Res = Nand(Nor(A, C), B);1718break;1719case 0xfc:1720Res = Or(A, B);1721break;1722case 0xfd:1723if (ABCIsConst)1724Res = Nand(Nor(A, B), C);1725break;1726case 0xfe:1727if (ABCIsConst)1728Res = Or(Or(A, B), C);1729break;1730case 0xff:1731Res = {Constant::getAllOnesValue(Ty), 0xff};1732break;1733}17341735assert((Res.first == nullptr || Res.second == Imm) &&1736"Simplification of ternary logic does not verify!");1737return Res.first;1738}17391740static Value *simplifyX86insertps(const IntrinsicInst &II,1741InstCombiner::BuilderTy &Builder) {1742auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));1743if (!CInt)1744return nullptr;17451746auto *VecTy = cast<FixedVectorType>(II.getType());1747assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");17481749// The immediate permute control byte looks like this:1750// [3:0] - zero mask for each 32-bit lane1751// [5:4] - select one 32-bit destination lane1752// [7:6] - select one 32-bit source lane17531754uint8_t Imm = CInt->getZExtValue();1755uint8_t ZMask = Imm & 0xf;1756uint8_t DestLane = (Imm >> 4) & 0x3;1757uint8_t SourceLane = (Imm >> 6) & 0x3;17581759ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);17601761// If all zero mask bits are set, this was just a weird way to1762// generate a zero vector.1763if (ZMask == 0xf)1764return ZeroVector;17651766// Initialize by passing all of the first source bits through.1767int ShuffleMask[4] = {0, 1, 2, 3};17681769// We may replace the second operand with the zero vector.1770Value *V1 = II.getArgOperand(1);17711772if (ZMask) {1773// If the zero mask is being used with a single input or the zero mask1774// overrides the destination lane, this is a shuffle with the zero vector.1775if ((II.getArgOperand(0) == II.getArgOperand(1)) ||1776(ZMask & (1 << DestLane))) {1777V1 = ZeroVector;1778// We may still move 32-bits of the first source vector from one lane1779// to another.1780ShuffleMask[DestLane] = SourceLane;1781// The zero mask may override the previous insert operation.1782for (unsigned i = 0; i < 4; ++i)1783if ((ZMask >> i) & 0x1)1784ShuffleMask[i] = i + 4;1785} else {1786// TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?1787return nullptr;1788}1789} else {1790// Replace the selected destination lane with the selected source lane.1791ShuffleMask[DestLane] = SourceLane + 
4;1792}17931794return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);1795}17961797/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding1798/// or conversion to a shuffle vector.1799static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,1800ConstantInt *CILength, ConstantInt *CIIndex,1801InstCombiner::BuilderTy &Builder) {1802auto LowConstantHighUndef = [&](uint64_t Val) {1803Type *IntTy64 = Type::getInt64Ty(II.getContext());1804Constant *Args[] = {ConstantInt::get(IntTy64, Val),1805UndefValue::get(IntTy64)};1806return ConstantVector::get(Args);1807};18081809// See if we're dealing with constant values.1810auto *C0 = dyn_cast<Constant>(Op0);1811auto *CI0 =1812C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))1813: nullptr;18141815// Attempt to constant fold.1816if (CILength && CIIndex) {1817// From AMD documentation: "The bit index and field length are each six1818// bits in length other bits of the field are ignored."1819APInt APIndex = CIIndex->getValue().zextOrTrunc(6);1820APInt APLength = CILength->getValue().zextOrTrunc(6);18211822unsigned Index = APIndex.getZExtValue();18231824// From AMD documentation: "a value of zero in the field length is1825// defined as length of 64".1826unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();18271828// From AMD documentation: "If the sum of the bit index + length field1829// is greater than 64, the results are undefined".1830unsigned End = Index + Length;18311832// Note that both field index and field length are 8-bit quantities.1833// Since variables 'Index' and 'Length' are unsigned values1834// obtained from zero-extending field index and field length1835// respectively, their sum should never wrap around.1836if (End > 64)1837return UndefValue::get(II.getType());18381839// If we are inserting whole bytes, we can convert this to a shuffle.1840// Lowering can recognize EXTRQI shuffle masks.1841if ((Length % 8) == 0 && (Index % 8) == 0) {1842// Convert bit indices to byte indices.1843Length /= 8;1844Index /= 8;18451846Type *IntTy8 = Type::getInt8Ty(II.getContext());1847auto *ShufTy = FixedVectorType::get(IntTy8, 16);18481849SmallVector<int, 16> ShuffleMask;1850for (int i = 0; i != (int)Length; ++i)1851ShuffleMask.push_back(i + Index);1852for (int i = Length; i != 8; ++i)1853ShuffleMask.push_back(i + 16);1854for (int i = 8; i != 16; ++i)1855ShuffleMask.push_back(-1);18561857Value *SV = Builder.CreateShuffleVector(1858Builder.CreateBitCast(Op0, ShufTy),1859ConstantAggregateZero::get(ShufTy), ShuffleMask);1860return Builder.CreateBitCast(SV, II.getType());1861}18621863// Constant Fold - shift Index'th bit to lowest position and mask off1864// Length bits.1865if (CI0) {1866APInt Elt = CI0->getValue();1867Elt.lshrInPlace(Index);1868Elt = Elt.zextOrTrunc(Length);1869return LowConstantHighUndef(Elt.getZExtValue());1870}18711872// If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.1873if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {1874Value *Args[] = {Op0, CILength, CIIndex};1875Module *M = II.getModule();1876Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);1877return Builder.CreateCall(F, Args);1878}1879}18801881// Constant Fold - extraction from zero is always {zero, undef}.1882if (CI0 && CI0->isZero())1883return LowConstantHighUndef(0);18841885return nullptr;1886}18871888/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant1889/// folding or conversion to a shuffle vector.1890static Value 
*simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,1891APInt APLength, APInt APIndex,1892InstCombiner::BuilderTy &Builder) {1893// From AMD documentation: "The bit index and field length are each six bits1894// in length other bits of the field are ignored."1895APIndex = APIndex.zextOrTrunc(6);1896APLength = APLength.zextOrTrunc(6);18971898// Attempt to constant fold.1899unsigned Index = APIndex.getZExtValue();19001901// From AMD documentation: "a value of zero in the field length is1902// defined as length of 64".1903unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();19041905// From AMD documentation: "If the sum of the bit index + length field1906// is greater than 64, the results are undefined".1907unsigned End = Index + Length;19081909// Note that both field index and field length are 8-bit quantities.1910// Since variables 'Index' and 'Length' are unsigned values1911// obtained from zero-extending field index and field length1912// respectively, their sum should never wrap around.1913if (End > 64)1914return UndefValue::get(II.getType());19151916// If we are inserting whole bytes, we can convert this to a shuffle.1917// Lowering can recognize INSERTQI shuffle masks.1918if ((Length % 8) == 0 && (Index % 8) == 0) {1919// Convert bit indices to byte indices.1920Length /= 8;1921Index /= 8;19221923Type *IntTy8 = Type::getInt8Ty(II.getContext());1924auto *ShufTy = FixedVectorType::get(IntTy8, 16);19251926SmallVector<int, 16> ShuffleMask;1927for (int i = 0; i != (int)Index; ++i)1928ShuffleMask.push_back(i);1929for (int i = 0; i != (int)Length; ++i)1930ShuffleMask.push_back(i + 16);1931for (int i = Index + Length; i != 8; ++i)1932ShuffleMask.push_back(i);1933for (int i = 8; i != 16; ++i)1934ShuffleMask.push_back(-1);19351936Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),1937Builder.CreateBitCast(Op1, ShufTy),1938ShuffleMask);1939return Builder.CreateBitCast(SV, II.getType());1940}19411942// See if we're dealing with constant values.1943auto *C0 = dyn_cast<Constant>(Op0);1944auto *C1 = dyn_cast<Constant>(Op1);1945auto *CI00 =1946C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))1947: nullptr;1948auto *CI10 =1949C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))1950: nullptr;19511952// Constant Fold - insert bottom Length bits starting at the Index'th bit.1953if (CI00 && CI10) {1954APInt V00 = CI00->getValue();1955APInt V10 = CI10->getValue();1956APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);1957V00 = V00 & ~Mask;1958V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);1959APInt Val = V00 | V10;1960Type *IntTy64 = Type::getInt64Ty(II.getContext());1961Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),1962UndefValue::get(IntTy64)};1963return ConstantVector::get(Args);1964}19651966// If we were an INSERTQ call, we'll save demanded elements if we convert to1967// INSERTQI.1968if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {1969Type *IntTy8 = Type::getInt8Ty(II.getContext());1970Constant *CILength = ConstantInt::get(IntTy8, Length, false);1971Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);19721973Value *Args[] = {Op0, Op1, CILength, CIIndex};1974Module *M = II.getModule();1975Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);1976return Builder.CreateCall(F, Args);1977}19781979return nullptr;1980}19811982/// Attempt to convert pshufb* to shufflevector if the mask is constant.1983static Value *simplifyX86pshufb(const IntrinsicInst &II,1984InstCombiner::BuilderTy &Builder) {1985auto *V = dyn_cast<Constant>(II.getArgOperand(1));1986if (!V)1987return nullptr;19881989auto *VecTy = cast<FixedVectorType>(II.getType());1990unsigned NumElts = VecTy->getNumElements();1991assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&1992"Unexpected number of elements in shuffle mask!");19931994// Construct a shuffle mask from constant integers or UNDEFs.1995int Indexes[64];19961997// Each byte in the shuffle control mask forms an index to permute the1998// corresponding byte in the destination operand.1999for (unsigned I = 0; I < NumElts; ++I) {2000Constant *COp = V->getAggregateElement(I);2001if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2002return nullptr;20032004if (isa<UndefValue>(COp)) {2005Indexes[I] = -1;2006continue;2007}20082009int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();20102011// If the most significant bit (bit[7]) of each byte of the shuffle2012// control mask is set, then zero is written in the result byte.2013// The zero vector is in the right-hand side of the resulting2014// shufflevector.20152016// The value of each index for the high 128-bit lane is the least2017// significant 4 bits of the respective shuffle control byte.2018Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);2019Indexes[I] = Index;2020}20212022auto V1 = II.getArgOperand(0);2023auto V2 = Constant::getNullValue(VecTy);2024return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));2025}20262027/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.2028static Value *simplifyX86vpermilvar(const IntrinsicInst &II,2029InstCombiner::BuilderTy &Builder) {2030auto *V = dyn_cast<Constant>(II.getArgOperand(1));2031if (!V)2032return nullptr;20332034auto *VecTy = cast<FixedVectorType>(II.getType());2035unsigned NumElts = VecTy->getNumElements();2036bool IsPD = VecTy->getScalarType()->isDoubleTy();2037unsigned NumLaneElts = IsPD ? 
2 : 4;2038assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);20392040// Construct a shuffle mask from constant integers or UNDEFs.2041int Indexes[16];20422043// The intrinsics only read one or two bits, clear the rest.2044for (unsigned I = 0; I < NumElts; ++I) {2045Constant *COp = V->getAggregateElement(I);2046if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2047return nullptr;20482049if (isa<UndefValue>(COp)) {2050Indexes[I] = -1;2051continue;2052}20532054APInt Index = cast<ConstantInt>(COp)->getValue();2055Index = Index.zextOrTrunc(32).getLoBits(2);20562057// The PD variants uses bit 1 to select per-lane element index, so2058// shift down to convert to generic shuffle mask index.2059if (IsPD)2060Index.lshrInPlace(1);20612062// The _256 variants are a bit trickier since the mask bits always index2063// into the corresponding 128 half. In order to convert to a generic2064// shuffle, we have to make that explicit.2065Index += APInt(32, (I / NumLaneElts) * NumLaneElts);20662067Indexes[I] = Index.getZExtValue();2068}20692070auto V1 = II.getArgOperand(0);2071return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));2072}20732074/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.2075static Value *simplifyX86vpermv(const IntrinsicInst &II,2076InstCombiner::BuilderTy &Builder) {2077auto *V = dyn_cast<Constant>(II.getArgOperand(1));2078if (!V)2079return nullptr;20802081auto *VecTy = cast<FixedVectorType>(II.getType());2082unsigned Size = VecTy->getNumElements();2083assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&2084"Unexpected shuffle mask size");20852086// Construct a shuffle mask from constant integers or UNDEFs.2087int Indexes[64];20882089for (unsigned I = 0; I < Size; ++I) {2090Constant *COp = V->getAggregateElement(I);2091if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2092return nullptr;20932094if (isa<UndefValue>(COp)) {2095Indexes[I] = -1;2096continue;2097}20982099uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();2100Index &= Size - 1;2101Indexes[I] = Index;2102}21032104auto V1 = II.getArgOperand(0);2105return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));2106}21072108/// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.2109static Value *simplifyX86vpermv3(const IntrinsicInst &II,2110InstCombiner::BuilderTy &Builder) {2111auto *V = dyn_cast<Constant>(II.getArgOperand(1));2112if (!V)2113return nullptr;21142115auto *VecTy = cast<FixedVectorType>(II.getType());2116unsigned Size = VecTy->getNumElements();2117assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||2118Size == 64) &&2119"Unexpected shuffle mask size");21202121// Construct a shuffle mask from constant integers or UNDEFs.2122int Indexes[64];21232124for (unsigned I = 0; I < Size; ++I) {2125Constant *COp = V->getAggregateElement(I);2126if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2127return nullptr;21282129if (isa<UndefValue>(COp)) {2130Indexes[I] = -1;2131continue;2132}21332134uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();2135Index &= (2 * Size) - 1;2136Indexes[I] = Index;2137}21382139auto V1 = II.getArgOperand(0);2140auto V2 = II.getArgOperand(2);2141return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));2142}21432144std::optional<Instruction *>2145X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {2146auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,2147unsigned 
                                             DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors.
If2319// we can simplify the input based on that, do so now.2320Value *Arg = II.getArgOperand(0);2321unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();2322if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {2323return IC.replaceOperand(II, 0, V);2324}2325break;2326}23272328case Intrinsic::x86_mmx_pmovmskb:2329case Intrinsic::x86_sse_movmsk_ps:2330case Intrinsic::x86_sse2_movmsk_pd:2331case Intrinsic::x86_sse2_pmovmskb_128:2332case Intrinsic::x86_avx_movmsk_pd_256:2333case Intrinsic::x86_avx_movmsk_ps_256:2334case Intrinsic::x86_avx2_pmovmskb:2335if (Value *V = simplifyX86movmsk(II, IC.Builder)) {2336return IC.replaceInstUsesWith(II, V);2337}2338break;23392340case Intrinsic::x86_sse_comieq_ss:2341case Intrinsic::x86_sse_comige_ss:2342case Intrinsic::x86_sse_comigt_ss:2343case Intrinsic::x86_sse_comile_ss:2344case Intrinsic::x86_sse_comilt_ss:2345case Intrinsic::x86_sse_comineq_ss:2346case Intrinsic::x86_sse_ucomieq_ss:2347case Intrinsic::x86_sse_ucomige_ss:2348case Intrinsic::x86_sse_ucomigt_ss:2349case Intrinsic::x86_sse_ucomile_ss:2350case Intrinsic::x86_sse_ucomilt_ss:2351case Intrinsic::x86_sse_ucomineq_ss:2352case Intrinsic::x86_sse2_comieq_sd:2353case Intrinsic::x86_sse2_comige_sd:2354case Intrinsic::x86_sse2_comigt_sd:2355case Intrinsic::x86_sse2_comile_sd:2356case Intrinsic::x86_sse2_comilt_sd:2357case Intrinsic::x86_sse2_comineq_sd:2358case Intrinsic::x86_sse2_ucomieq_sd:2359case Intrinsic::x86_sse2_ucomige_sd:2360case Intrinsic::x86_sse2_ucomigt_sd:2361case Intrinsic::x86_sse2_ucomile_sd:2362case Intrinsic::x86_sse2_ucomilt_sd:2363case Intrinsic::x86_sse2_ucomineq_sd:2364case Intrinsic::x86_avx512_vcomi_ss:2365case Intrinsic::x86_avx512_vcomi_sd:2366case Intrinsic::x86_avx512_mask_cmp_ss:2367case Intrinsic::x86_avx512_mask_cmp_sd: {2368// These intrinsics only demand the 0th element of their input vectors. 
If2369// we can simplify the input based on that, do so now.2370bool MadeChange = false;2371Value *Arg0 = II.getArgOperand(0);2372Value *Arg1 = II.getArgOperand(1);2373unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();2374if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {2375IC.replaceOperand(II, 0, V);2376MadeChange = true;2377}2378if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {2379IC.replaceOperand(II, 1, V);2380MadeChange = true;2381}2382if (MadeChange) {2383return &II;2384}2385break;2386}23872388case Intrinsic::x86_avx512_add_ps_512:2389case Intrinsic::x86_avx512_div_ps_512:2390case Intrinsic::x86_avx512_mul_ps_512:2391case Intrinsic::x86_avx512_sub_ps_512:2392case Intrinsic::x86_avx512_add_pd_512:2393case Intrinsic::x86_avx512_div_pd_512:2394case Intrinsic::x86_avx512_mul_pd_512:2395case Intrinsic::x86_avx512_sub_pd_512:2396// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular2397// IR operations.2398if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {2399if (R->getValue() == 4) {2400Value *Arg0 = II.getArgOperand(0);2401Value *Arg1 = II.getArgOperand(1);24022403Value *V;2404switch (IID) {2405default:2406llvm_unreachable("Case stmts out of sync!");2407case Intrinsic::x86_avx512_add_ps_512:2408case Intrinsic::x86_avx512_add_pd_512:2409V = IC.Builder.CreateFAdd(Arg0, Arg1);2410break;2411case Intrinsic::x86_avx512_sub_ps_512:2412case Intrinsic::x86_avx512_sub_pd_512:2413V = IC.Builder.CreateFSub(Arg0, Arg1);2414break;2415case Intrinsic::x86_avx512_mul_ps_512:2416case Intrinsic::x86_avx512_mul_pd_512:2417V = IC.Builder.CreateFMul(Arg0, Arg1);2418break;2419case Intrinsic::x86_avx512_div_ps_512:2420case Intrinsic::x86_avx512_div_pd_512:2421V = IC.Builder.CreateFDiv(Arg0, Arg1);2422break;2423}24242425return IC.replaceInstUsesWith(II, V);2426}2427}2428break;24292430case Intrinsic::x86_avx512_mask_add_ss_round:2431case Intrinsic::x86_avx512_mask_div_ss_round:2432case Intrinsic::x86_avx512_mask_mul_ss_round:2433case Intrinsic::x86_avx512_mask_sub_ss_round:2434case Intrinsic::x86_avx512_mask_add_sd_round:2435case Intrinsic::x86_avx512_mask_div_sd_round:2436case Intrinsic::x86_avx512_mask_mul_sd_round:2437case Intrinsic::x86_avx512_mask_sub_sd_round:2438// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular2439// IR operations.2440if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {2441if (R->getValue() == 4) {2442// Extract the element as scalars.2443Value *Arg0 = II.getArgOperand(0);2444Value *Arg1 = II.getArgOperand(1);2445Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);2446Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);24472448Value *V;2449switch (IID) {2450default:2451llvm_unreachable("Case stmts out of sync!");2452case Intrinsic::x86_avx512_mask_add_ss_round:2453case Intrinsic::x86_avx512_mask_add_sd_round:2454V = IC.Builder.CreateFAdd(LHS, RHS);2455break;2456case Intrinsic::x86_avx512_mask_sub_ss_round:2457case Intrinsic::x86_avx512_mask_sub_sd_round:2458V = IC.Builder.CreateFSub(LHS, RHS);2459break;2460case Intrinsic::x86_avx512_mask_mul_ss_round:2461case Intrinsic::x86_avx512_mask_mul_sd_round:2462V = IC.Builder.CreateFMul(LHS, RHS);2463break;2464case Intrinsic::x86_avx512_mask_div_ss_round:2465case Intrinsic::x86_avx512_mask_div_sd_round:2466V = IC.Builder.CreateFDiv(LHS, RHS);2467break;2468}24692470// Handle the masking aspect of the intrinsic.2471Value *Mask = II.getArgOperand(3);2472auto *C = dyn_cast<ConstantInt>(Mask);2473// We don't need a 
select if we know the mask bit is a 1.2474if (!C || !C->getValue()[0]) {2475// Cast the mask to an i1 vector and then extract the lowest element.2476auto *MaskTy = FixedVectorType::get(2477IC.Builder.getInt1Ty(),2478cast<IntegerType>(Mask->getType())->getBitWidth());2479Mask = IC.Builder.CreateBitCast(Mask, MaskTy);2480Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);2481// Extract the lowest element from the passthru operand.2482Value *Passthru =2483IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);2484V = IC.Builder.CreateSelect(Mask, V, Passthru);2485}24862487// Insert the result back into the original argument 0.2488V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);24892490return IC.replaceInstUsesWith(II, V);2491}2492}2493break;24942495// Constant fold ashr( <A x Bi>, Ci ).2496// Constant fold lshr( <A x Bi>, Ci ).2497// Constant fold shl( <A x Bi>, Ci ).2498case Intrinsic::x86_sse2_psrai_d:2499case Intrinsic::x86_sse2_psrai_w:2500case Intrinsic::x86_avx2_psrai_d:2501case Intrinsic::x86_avx2_psrai_w:2502case Intrinsic::x86_avx512_psrai_q_128:2503case Intrinsic::x86_avx512_psrai_q_256:2504case Intrinsic::x86_avx512_psrai_d_512:2505case Intrinsic::x86_avx512_psrai_q_512:2506case Intrinsic::x86_avx512_psrai_w_512:2507case Intrinsic::x86_sse2_psrli_d:2508case Intrinsic::x86_sse2_psrli_q:2509case Intrinsic::x86_sse2_psrli_w:2510case Intrinsic::x86_avx2_psrli_d:2511case Intrinsic::x86_avx2_psrli_q:2512case Intrinsic::x86_avx2_psrli_w:2513case Intrinsic::x86_avx512_psrli_d_512:2514case Intrinsic::x86_avx512_psrli_q_512:2515case Intrinsic::x86_avx512_psrli_w_512:2516case Intrinsic::x86_sse2_pslli_d:2517case Intrinsic::x86_sse2_pslli_q:2518case Intrinsic::x86_sse2_pslli_w:2519case Intrinsic::x86_avx2_pslli_d:2520case Intrinsic::x86_avx2_pslli_q:2521case Intrinsic::x86_avx2_pslli_w:2522case Intrinsic::x86_avx512_pslli_d_512:2523case Intrinsic::x86_avx512_pslli_q_512:2524case Intrinsic::x86_avx512_pslli_w_512:2525if (Value *V = simplifyX86immShift(II, IC.Builder)) {2526return IC.replaceInstUsesWith(II, V);2527}2528break;25292530case Intrinsic::x86_sse2_psra_d:2531case Intrinsic::x86_sse2_psra_w:2532case Intrinsic::x86_avx2_psra_d:2533case Intrinsic::x86_avx2_psra_w:2534case Intrinsic::x86_avx512_psra_q_128:2535case Intrinsic::x86_avx512_psra_q_256:2536case Intrinsic::x86_avx512_psra_d_512:2537case Intrinsic::x86_avx512_psra_q_512:2538case Intrinsic::x86_avx512_psra_w_512:2539case Intrinsic::x86_sse2_psrl_d:2540case Intrinsic::x86_sse2_psrl_q:2541case Intrinsic::x86_sse2_psrl_w:2542case Intrinsic::x86_avx2_psrl_d:2543case Intrinsic::x86_avx2_psrl_q:2544case Intrinsic::x86_avx2_psrl_w:2545case Intrinsic::x86_avx512_psrl_d_512:2546case Intrinsic::x86_avx512_psrl_q_512:2547case Intrinsic::x86_avx512_psrl_w_512:2548case Intrinsic::x86_sse2_psll_d:2549case Intrinsic::x86_sse2_psll_q:2550case Intrinsic::x86_sse2_psll_w:2551case Intrinsic::x86_avx2_psll_d:2552case Intrinsic::x86_avx2_psll_q:2553case Intrinsic::x86_avx2_psll_w:2554case Intrinsic::x86_avx512_psll_d_512:2555case Intrinsic::x86_avx512_psll_q_512:2556case Intrinsic::x86_avx512_psll_w_512: {2557if (Value *V = simplifyX86immShift(II, IC.Builder)) {2558return IC.replaceInstUsesWith(II, V);2559}25602561// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector2562// operand to compute the shift amount.2563Value *Arg1 = II.getArgOperand(1);2564assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&2565"Unexpected packed shift size");2566unsigned VWidth = 
cast<FixedVectorType>(Arg1->getType())->getNumElements();25672568if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {2569return IC.replaceOperand(II, 1, V);2570}2571break;2572}25732574case Intrinsic::x86_avx2_psllv_d:2575case Intrinsic::x86_avx2_psllv_d_256:2576case Intrinsic::x86_avx2_psllv_q:2577case Intrinsic::x86_avx2_psllv_q_256:2578case Intrinsic::x86_avx512_psllv_d_512:2579case Intrinsic::x86_avx512_psllv_q_512:2580case Intrinsic::x86_avx512_psllv_w_128:2581case Intrinsic::x86_avx512_psllv_w_256:2582case Intrinsic::x86_avx512_psllv_w_512:2583case Intrinsic::x86_avx2_psrav_d:2584case Intrinsic::x86_avx2_psrav_d_256:2585case Intrinsic::x86_avx512_psrav_q_128:2586case Intrinsic::x86_avx512_psrav_q_256:2587case Intrinsic::x86_avx512_psrav_d_512:2588case Intrinsic::x86_avx512_psrav_q_512:2589case Intrinsic::x86_avx512_psrav_w_128:2590case Intrinsic::x86_avx512_psrav_w_256:2591case Intrinsic::x86_avx512_psrav_w_512:2592case Intrinsic::x86_avx2_psrlv_d:2593case Intrinsic::x86_avx2_psrlv_d_256:2594case Intrinsic::x86_avx2_psrlv_q:2595case Intrinsic::x86_avx2_psrlv_q_256:2596case Intrinsic::x86_avx512_psrlv_d_512:2597case Intrinsic::x86_avx512_psrlv_q_512:2598case Intrinsic::x86_avx512_psrlv_w_128:2599case Intrinsic::x86_avx512_psrlv_w_256:2600case Intrinsic::x86_avx512_psrlv_w_512:2601if (Value *V = simplifyX86varShift(II, IC.Builder)) {2602return IC.replaceInstUsesWith(II, V);2603}2604break;26052606case Intrinsic::x86_sse2_packssdw_128:2607case Intrinsic::x86_sse2_packsswb_128:2608case Intrinsic::x86_avx2_packssdw:2609case Intrinsic::x86_avx2_packsswb:2610case Intrinsic::x86_avx512_packssdw_512:2611case Intrinsic::x86_avx512_packsswb_512:2612if (Value *V = simplifyX86pack(II, IC.Builder, true)) {2613return IC.replaceInstUsesWith(II, V);2614}2615break;26162617case Intrinsic::x86_sse2_packuswb_128:2618case Intrinsic::x86_sse41_packusdw:2619case Intrinsic::x86_avx2_packusdw:2620case Intrinsic::x86_avx2_packuswb:2621case Intrinsic::x86_avx512_packusdw_512:2622case Intrinsic::x86_avx512_packuswb_512:2623if (Value *V = simplifyX86pack(II, IC.Builder, false)) {2624return IC.replaceInstUsesWith(II, V);2625}2626break;26272628case Intrinsic::x86_sse2_pmulh_w:2629case Intrinsic::x86_avx2_pmulh_w:2630case Intrinsic::x86_avx512_pmulh_w_512:2631if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {2632return IC.replaceInstUsesWith(II, V);2633}2634break;26352636case Intrinsic::x86_sse2_pmulhu_w:2637case Intrinsic::x86_avx2_pmulhu_w:2638case Intrinsic::x86_avx512_pmulhu_w_512:2639if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {2640return IC.replaceInstUsesWith(II, V);2641}2642break;26432644case Intrinsic::x86_ssse3_pmul_hr_sw_128:2645case Intrinsic::x86_avx2_pmul_hr_sw:2646case Intrinsic::x86_avx512_pmul_hr_sw_512:2647if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {2648return IC.replaceInstUsesWith(II, V);2649}2650break;26512652case Intrinsic::x86_sse2_pmadd_wd:2653case Intrinsic::x86_avx2_pmadd_wd:2654case Intrinsic::x86_avx512_pmaddw_d_512:2655if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {2656return IC.replaceInstUsesWith(II, V);2657}2658break;26592660case Intrinsic::x86_ssse3_pmadd_ub_sw_128:2661case Intrinsic::x86_avx2_pmadd_ub_sw:2662case Intrinsic::x86_avx512_pmaddubs_w_512:2663if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {2664return IC.replaceInstUsesWith(II, V);2665}2666break;26672668case Intrinsic::x86_pclmulqdq:2669case Intrinsic::x86_pclmulqdq_256:2670case Intrinsic::x86_pclmulqdq_512: {2671if (auto *C = 
dyn_cast<ConstantInt>(II.getArgOperand(2))) {2672unsigned Imm = C->getZExtValue();26732674bool MadeChange = false;2675Value *Arg0 = II.getArgOperand(0);2676Value *Arg1 = II.getArgOperand(1);2677unsigned VWidth =2678cast<FixedVectorType>(Arg0->getType())->getNumElements();26792680APInt UndefElts1(VWidth, 0);2681APInt DemandedElts1 =2682APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));2683if (Value *V =2684IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {2685IC.replaceOperand(II, 0, V);2686MadeChange = true;2687}26882689APInt UndefElts2(VWidth, 0);2690APInt DemandedElts2 =2691APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));2692if (Value *V =2693IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {2694IC.replaceOperand(II, 1, V);2695MadeChange = true;2696}26972698// If either input elements are undef, the result is zero.2699if (DemandedElts1.isSubsetOf(UndefElts1) ||2700DemandedElts2.isSubsetOf(UndefElts2)) {2701return IC.replaceInstUsesWith(II,2702ConstantAggregateZero::get(II.getType()));2703}27042705if (MadeChange) {2706return &II;2707}2708}2709break;2710}27112712case Intrinsic::x86_sse41_insertps:2713if (Value *V = simplifyX86insertps(II, IC.Builder)) {2714return IC.replaceInstUsesWith(II, V);2715}2716break;27172718case Intrinsic::x86_sse4a_extrq: {2719Value *Op0 = II.getArgOperand(0);2720Value *Op1 = II.getArgOperand(1);2721unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();2722unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();2723assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&2724Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&2725VWidth1 == 16 && "Unexpected operand sizes");27262727// See if we're dealing with constant values.2728auto *C1 = dyn_cast<Constant>(Op1);2729auto *CILength =2730C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))2731: nullptr;2732auto *CIIndex =2733C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))2734: nullptr;27352736// Attempt to simplify to a constant, shuffle vector or EXTRQI call.2737if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {2738return IC.replaceInstUsesWith(II, V);2739}27402741// EXTRQ only uses the lowest 64-bits of the first 128-bit vector2742// operands and the lowest 16-bits of the second.2743bool MadeChange = false;2744if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {2745IC.replaceOperand(II, 0, V);2746MadeChange = true;2747}2748if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {2749IC.replaceOperand(II, 1, V);2750MadeChange = true;2751}2752if (MadeChange) {2753return &II;2754}2755break;2756}27572758case Intrinsic::x86_sse4a_extrqi: {2759// EXTRQI: Extract Length bits starting from Index. Zero pad the remaining2760// bits of the lower 64-bits. 
The upper 64-bits are undefined.2761Value *Op0 = II.getArgOperand(0);2762unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();2763assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&2764"Unexpected operand size");27652766// See if we're dealing with constant values.2767auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));2768auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));27692770// Attempt to simplify to a constant or shuffle vector.2771if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {2772return IC.replaceInstUsesWith(II, V);2773}27742775// EXTRQI only uses the lowest 64-bits of the first 128-bit vector2776// operand.2777if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {2778return IC.replaceOperand(II, 0, V);2779}2780break;2781}27822783case Intrinsic::x86_sse4a_insertq: {2784Value *Op0 = II.getArgOperand(0);2785Value *Op1 = II.getArgOperand(1);2786unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();2787assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&2788Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&2789cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&2790"Unexpected operand size");27912792// See if we're dealing with constant values.2793auto *C1 = dyn_cast<Constant>(Op1);2794auto *CI11 =2795C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))2796: nullptr;27972798// Attempt to simplify to a constant, shuffle vector or INSERTQI call.2799if (CI11) {2800const APInt &V11 = CI11->getValue();2801APInt Len = V11.zextOrTrunc(6);2802APInt Idx = V11.lshr(8).zextOrTrunc(6);2803if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {2804return IC.replaceInstUsesWith(II, V);2805}2806}28072808// INSERTQ only uses the lowest 64-bits of the first 128-bit vector2809// operand.2810if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {2811return IC.replaceOperand(II, 0, V);2812}2813break;2814}28152816case Intrinsic::x86_sse4a_insertqi: {2817// INSERTQI: Extract lowest Length bits from lower half of second source and2818// insert over first source starting at Index bit. 
The upper 64-bits are2819// undefined.2820Value *Op0 = II.getArgOperand(0);2821Value *Op1 = II.getArgOperand(1);2822unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();2823unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();2824assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&2825Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&2826VWidth1 == 2 && "Unexpected operand sizes");28272828// See if we're dealing with constant values.2829auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));2830auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));28312832// Attempt to simplify to a constant or shuffle vector.2833if (CILength && CIIndex) {2834APInt Len = CILength->getValue().zextOrTrunc(6);2835APInt Idx = CIIndex->getValue().zextOrTrunc(6);2836if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {2837return IC.replaceInstUsesWith(II, V);2838}2839}28402841// INSERTQI only uses the lowest 64-bits of the first two 128-bit vector2842// operands.2843bool MadeChange = false;2844if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {2845IC.replaceOperand(II, 0, V);2846MadeChange = true;2847}2848if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {2849IC.replaceOperand(II, 1, V);2850MadeChange = true;2851}2852if (MadeChange) {2853return &II;2854}2855break;2856}28572858case Intrinsic::x86_sse41_pblendvb:2859case Intrinsic::x86_sse41_blendvps:2860case Intrinsic::x86_sse41_blendvpd:2861case Intrinsic::x86_avx_blendv_ps_256:2862case Intrinsic::x86_avx_blendv_pd_256:2863case Intrinsic::x86_avx2_pblendvb: {2864// fold (blend A, A, Mask) -> A2865Value *Op0 = II.getArgOperand(0);2866Value *Op1 = II.getArgOperand(1);2867Value *Mask = II.getArgOperand(2);2868if (Op0 == Op1) {2869return IC.replaceInstUsesWith(II, Op0);2870}28712872// Zero Mask - select 1st argument.2873if (isa<ConstantAggregateZero>(Mask)) {2874return IC.replaceInstUsesWith(II, Op0);2875}28762877// Constant Mask - select 1st/2nd argument lane based on top bit of mask.2878if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {2879Constant *NewSelector =2880getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());2881return SelectInst::Create(NewSelector, Op1, Op0, "blendv");2882}28832884Mask = InstCombiner::peekThroughBitcast(Mask);28852886// Peek through a one-use shuffle - VectorCombine should have simplified2887// this for cases where we're splitting wider vectors to use blendv2888// intrinsics.2889Value *MaskSrc = nullptr;2890ArrayRef<int> ShuffleMask;2891if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),2892m_Mask(ShuffleMask))))) {2893// Bail if the shuffle was irregular or contains undefs.2894int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();2895if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||2896any_of(ShuffleMask,2897[NumElts](int M) { return M < 0 || M >= NumElts; }))2898break;2899Mask = InstCombiner::peekThroughBitcast(MaskSrc);2900}29012902// Convert to a vector select if we can bypass casts and find a boolean2903// vector condition value.2904Value *BoolVec;2905if (match(Mask, m_SExt(m_Value(BoolVec))) &&2906BoolVec->getType()->isVectorTy() &&2907BoolVec->getType()->getScalarSizeInBits() == 1) {2908auto *MaskTy = cast<FixedVectorType>(Mask->getType());2909auto *OpTy = cast<FixedVectorType>(II.getType());2910unsigned NumMaskElts = MaskTy->getNumElements();2911unsigned NumOperandElts = OpTy->getNumElements();29122913// If we peeked through a shuffle, reapply 
      // the shuffle to the bool vector.
      if (MaskSrc) {
        unsigned NumMaskSrcElts =
            cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
        NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
        // Multiple mask bits map to the same operand element - bail out.
        if (NumMaskElts > NumOperandElts)
          break;
        SmallVector<int> ScaledMask;
        if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
          break;
        BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
        MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
      }
      assert(MaskTy->getPrimitiveSizeInBits() ==
                 OpTy->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx512_vpermi2var_d_128:
  case Intrinsic::x86_avx512_vpermi2var_d_256:
  case Intrinsic::x86_avx512_vpermi2var_d_512:
  case Intrinsic::x86_avx512_vpermi2var_hi_128:
  case Intrinsic::x86_avx512_vpermi2var_hi_256:
  case Intrinsic::x86_avx512_vpermi2var_hi_512:
  case Intrinsic::x86_avx512_vpermi2var_pd_128:
  case Intrinsic::x86_avx512_vpermi2var_pd_256:
  case Intrinsic::x86_avx512_vpermi2var_pd_512:
  case Intrinsic::x86_avx512_vpermi2var_ps_128:
  case Intrinsic::x86_avx512_vpermi2var_ps_256:
  case Intrinsic::x86_avx512_vpermi2var_ps_512:
  case Intrinsic::x86_avx512_vpermi2var_q_128:
  case Intrinsic::x86_avx512_vpermi2var_q_256:
  case Intrinsic::x86_avx512_vpermi2var_q_512:
  case Intrinsic::x86_avx512_vpermi2var_qi_128:
  case Intrinsic::x86_avx512_vpermi2var_qi_256:
  case Intrinsic::x86_avx512_vpermi2var_qi_512:
    if (Value *V =
simplifyX86vpermv3(II, IC.Builder)) {3005return IC.replaceInstUsesWith(II, V);3006}3007break;30083009case Intrinsic::x86_avx_maskload_ps:3010case Intrinsic::x86_avx_maskload_pd:3011case Intrinsic::x86_avx_maskload_ps_256:3012case Intrinsic::x86_avx_maskload_pd_256:3013case Intrinsic::x86_avx2_maskload_d:3014case Intrinsic::x86_avx2_maskload_q:3015case Intrinsic::x86_avx2_maskload_d_256:3016case Intrinsic::x86_avx2_maskload_q_256:3017if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {3018return I;3019}3020break;30213022case Intrinsic::x86_sse2_maskmov_dqu:3023case Intrinsic::x86_avx_maskstore_ps:3024case Intrinsic::x86_avx_maskstore_pd:3025case Intrinsic::x86_avx_maskstore_ps_256:3026case Intrinsic::x86_avx_maskstore_pd_256:3027case Intrinsic::x86_avx2_maskstore_d:3028case Intrinsic::x86_avx2_maskstore_q:3029case Intrinsic::x86_avx2_maskstore_d_256:3030case Intrinsic::x86_avx2_maskstore_q_256:3031if (simplifyX86MaskedStore(II, IC)) {3032return nullptr;3033}3034break;30353036case Intrinsic::x86_addcarry_32:3037case Intrinsic::x86_addcarry_64:3038if (Value *V = simplifyX86addcarry(II, IC.Builder)) {3039return IC.replaceInstUsesWith(II, V);3040}3041break;30423043case Intrinsic::x86_avx512_pternlog_d_128:3044case Intrinsic::x86_avx512_pternlog_d_256:3045case Intrinsic::x86_avx512_pternlog_d_512:3046case Intrinsic::x86_avx512_pternlog_q_128:3047case Intrinsic::x86_avx512_pternlog_q_256:3048case Intrinsic::x86_avx512_pternlog_q_512:3049if (Value *V = simplifyTernarylogic(II, IC.Builder)) {3050return IC.replaceInstUsesWith(II, V);3051}3052break;3053default:3054break;3055}3056return std::nullopt;3057}30583059std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(3060InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,3061bool &KnownBitsComputed) const {3062switch (II.getIntrinsicID()) {3063default:3064break;3065case Intrinsic::x86_mmx_pmovmskb:3066case Intrinsic::x86_sse_movmsk_ps:3067case Intrinsic::x86_sse2_movmsk_pd:3068case Intrinsic::x86_sse2_pmovmskb_128:3069case Intrinsic::x86_avx_movmsk_ps_256:3070case Intrinsic::x86_avx_movmsk_pd_256:3071case Intrinsic::x86_avx2_pmovmskb: {3072// MOVMSK copies the vector elements' sign bits to the low bits3073// and zeros the high bits.3074unsigned ArgWidth;3075if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {3076ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.3077} else {3078auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());3079ArgWidth = ArgType->getNumElements();3080}30813082// If we don't need any of low bits then return zero,3083// we know that DemandedMask is non-zero already.3084APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);3085Type *VTy = II.getType();3086if (DemandedElts.isZero()) {3087return ConstantInt::getNullValue(VTy);3088}30893090// We know that the upper bits are set to zero.3091Known.Zero.setBitsFrom(ArgWidth);3092KnownBitsComputed = true;3093break;3094}3095}3096return std::nullopt;3097}30983099std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(3100InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,3101APInt &UndefElts2, APInt &UndefElts3,3102std::function<void(Instruction *, unsigned, APInt, APInt &)>3103simplifyAndSetOp) const {3104unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();3105switch (II.getIntrinsicID()) {3106default:3107break;3108case Intrinsic::x86_xop_vfrcz_ss:3109case Intrinsic::x86_xop_vfrcz_sd:3110// The instructions for these intrinsics are speced to zero upper bits not3111// 
pass them through like other scalar intrinsics. So we shouldn't just3112// use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.3113// Instead we should return a zero vector.3114if (!DemandedElts[0]) {3115IC.addToWorklist(&II);3116return ConstantAggregateZero::get(II.getType());3117}31183119// Only the lower element is used.3120DemandedElts = 1;3121simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);31223123// Only the lower element is undefined. The high elements are zero.3124UndefElts = UndefElts[0];3125break;31263127// Unary scalar-as-vector operations that work column-wise.3128case Intrinsic::x86_sse_rcp_ss:3129case Intrinsic::x86_sse_rsqrt_ss:3130simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);31313132// If lowest element of a scalar op isn't used then use Arg0.3133if (!DemandedElts[0]) {3134IC.addToWorklist(&II);3135return II.getArgOperand(0);3136}3137// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions3138// checks).3139break;31403141// Binary scalar-as-vector operations that work column-wise. The high3142// elements come from operand 0. The low element is a function of both3143// operands.3144case Intrinsic::x86_sse_min_ss:3145case Intrinsic::x86_sse_max_ss:3146case Intrinsic::x86_sse_cmp_ss:3147case Intrinsic::x86_sse2_min_sd:3148case Intrinsic::x86_sse2_max_sd:3149case Intrinsic::x86_sse2_cmp_sd: {3150simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);31513152// If lowest element of a scalar op isn't used then use Arg0.3153if (!DemandedElts[0]) {3154IC.addToWorklist(&II);3155return II.getArgOperand(0);3156}31573158// Only lower element is used for operand 1.3159DemandedElts = 1;3160simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);31613162// Lower element is undefined if both lower elements are undefined.3163// Consider things like undef&0. The result is known zero, not undef.3164if (!UndefElts2[0])3165UndefElts.clearBit(0);31663167break;3168}31693170// Binary scalar-as-vector operations that work column-wise. The high3171// elements come from operand 0 and the low element comes from operand 1.3172case Intrinsic::x86_sse41_round_ss:3173case Intrinsic::x86_sse41_round_sd: {3174// Don't use the low element of operand 0.3175APInt DemandedElts2 = DemandedElts;3176DemandedElts2.clearBit(0);3177simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);31783179// If lowest element of a scalar op isn't used then use Arg0.3180if (!DemandedElts[0]) {3181IC.addToWorklist(&II);3182return II.getArgOperand(0);3183}31843185// Only lower element is used for operand 1.3186DemandedElts = 1;3187simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);31883189// Take the high undef elements from operand 0 and take the lower element3190// from operand 1.3191UndefElts.clearBit(0);3192UndefElts |= UndefElts2[0];3193break;3194}31953196// Three input scalar-as-vector operations that work column-wise. 
The high3197// elements come from operand 0 and the low element is a function of all3198// three inputs.3199case Intrinsic::x86_avx512_mask_add_ss_round:3200case Intrinsic::x86_avx512_mask_div_ss_round:3201case Intrinsic::x86_avx512_mask_mul_ss_round:3202case Intrinsic::x86_avx512_mask_sub_ss_round:3203case Intrinsic::x86_avx512_mask_max_ss_round:3204case Intrinsic::x86_avx512_mask_min_ss_round:3205case Intrinsic::x86_avx512_mask_add_sd_round:3206case Intrinsic::x86_avx512_mask_div_sd_round:3207case Intrinsic::x86_avx512_mask_mul_sd_round:3208case Intrinsic::x86_avx512_mask_sub_sd_round:3209case Intrinsic::x86_avx512_mask_max_sd_round:3210case Intrinsic::x86_avx512_mask_min_sd_round:3211simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);32123213// If lowest element of a scalar op isn't used then use Arg0.3214if (!DemandedElts[0]) {3215IC.addToWorklist(&II);3216return II.getArgOperand(0);3217}32183219// Only lower element is used for operand 1 and 2.3220DemandedElts = 1;3221simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3222simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);32233224// Lower element is undefined if all three lower elements are undefined.3225// Consider things like undef&0. The result is known zero, not undef.3226if (!UndefElts2[0] || !UndefElts3[0])3227UndefElts.clearBit(0);3228break;32293230// TODO: Add fmaddsub support?3231case Intrinsic::x86_sse3_addsub_pd:3232case Intrinsic::x86_sse3_addsub_ps:3233case Intrinsic::x86_avx_addsub_pd_256:3234case Intrinsic::x86_avx_addsub_ps_256: {3235// If none of the even or none of the odd lanes are required, turn this3236// into a generic FP math instruction.3237APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));3238APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));3239bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);3240bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);3241if (IsSubOnly || IsAddOnly) {3242assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");3243IRBuilderBase::InsertPointGuard Guard(IC.Builder);3244IC.Builder.SetInsertPoint(&II);3245Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);3246return IC.Builder.CreateBinOp(3247IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1);3248}32493250simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);3251simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3252UndefElts &= UndefElts2;3253break;3254}32553256// General per-element vector operations.3257case Intrinsic::x86_avx2_psllv_d:3258case Intrinsic::x86_avx2_psllv_d_256:3259case Intrinsic::x86_avx2_psllv_q:3260case Intrinsic::x86_avx2_psllv_q_256:3261case Intrinsic::x86_avx2_psrlv_d:3262case Intrinsic::x86_avx2_psrlv_d_256:3263case Intrinsic::x86_avx2_psrlv_q:3264case Intrinsic::x86_avx2_psrlv_q_256:3265case Intrinsic::x86_avx2_psrav_d:3266case Intrinsic::x86_avx2_psrav_d_256: {3267simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);3268simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3269UndefElts &= UndefElts2;3270break;3271}32723273case Intrinsic::x86_sse2_pmulh_w:3274case Intrinsic::x86_avx2_pmulh_w:3275case Intrinsic::x86_avx512_pmulh_w_512:3276case Intrinsic::x86_sse2_pmulhu_w:3277case Intrinsic::x86_avx2_pmulhu_w:3278case Intrinsic::x86_avx512_pmulhu_w_512:3279case Intrinsic::x86_ssse3_pmul_hr_sw_128:3280case Intrinsic::x86_avx2_pmul_hr_sw:3281case Intrinsic::x86_avx512_pmul_hr_sw_512: {3282simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);3283simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3284// NOTE: mulh(undef,undef) != undef.3285break;3286}32873288case Intrinsic::x86_sse2_packssdw_128:3289case Intrinsic::x86_sse2_packsswb_128:3290case Intrinsic::x86_sse2_packuswb_128:3291case Intrinsic::x86_sse41_packusdw:3292case Intrinsic::x86_avx2_packssdw:3293case Intrinsic::x86_avx2_packsswb:3294case Intrinsic::x86_avx2_packusdw:3295case Intrinsic::x86_avx2_packuswb:3296case Intrinsic::x86_avx512_packssdw_512:3297case Intrinsic::x86_avx512_packsswb_512:3298case Intrinsic::x86_avx512_packusdw_512:3299case Intrinsic::x86_avx512_packuswb_512: {3300auto *Ty0 = II.getArgOperand(0)->getType();3301unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();3302assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");33033304unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;3305unsigned VWidthPerLane = VWidth / NumLanes;3306unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;33073308// Per lane, pack the elements of the first input and then the second.3309// e.g.3310// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])3311// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])3312for (int OpNum = 0; OpNum != 2; ++OpNum) {3313APInt OpDemandedElts(InnerVWidth, 0);3314for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {3315unsigned LaneIdx = Lane * VWidthPerLane;3316for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {3317unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;3318if (DemandedElts[Idx])3319OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);3320}3321}33223323// Demand elements from the operand.3324APInt OpUndefElts(InnerVWidth, 0);3325simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);33263327// Pack the operand's UNDEF elements, one lane at a time.3328OpUndefElts = OpUndefElts.zext(VWidth);3329for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {3330APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);3331LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);3332LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);3333UndefElts |= LaneElts;3334}3335}3336break;3337}33383339case Intrinsic::x86_sse2_pmadd_wd:3340case Intrinsic::x86_avx2_pmadd_wd:3341case Intrinsic::x86_avx512_pmaddw_d_512:3342case Intrinsic::x86_ssse3_pmadd_ub_sw_128:3343case 
Intrinsic::x86_avx2_pmadd_ub_sw:
  case Intrinsic::x86_avx512_pmaddubs_w_512: {
    // PMADD - demand both src elements that map to each dst element.
    auto *ArgTy = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
    assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
    APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
    APInt Op0UndefElts(InnerVWidth, 0);
    APInt Op1UndefElts(InnerVWidth, 0);
    simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
    simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
    // NOTE: madd(undef,undef) != undef.
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}
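
// Illustrative worked example (comments only, not used by the pass) of the
// BMI PEXT/PDEP constant folding performed in instCombineIntrinsic above.
// The constants below are arbitrary and chosen only to trace the loops:
//
//   Mask = 0b0110'1010  (set bits at positions 1, 3, 5 and 6)
//
//   PDEP deposits the low bits of the source into the set positions of the
//   mask, scanning from the least significant mask bit upwards:
//     Src    = 0b1011
//     bit 0 of Src -> result bit 1
//     bit 1 of Src -> result bit 3
//     bit 2 of Src -> result bit 5
//     bit 3 of Src -> result bit 6
//     Result = 0b0100'1010
//
//   PEXT is the inverse: it gathers the source bits found at the set
//   positions of the mask and packs them into the low bits of the result:
//     Src    = 0b0100'1010
//     Result = 0b1011
//
// When the mask is a single contiguous run of ones (isShiftedMask), the fold
// emits plain IR instead of walking the bits, e.g. for Mask = 0xFF00
// (MaskIdx = 8):
//   pext(x, 0xFF00) == (x & 0xFF00) >> 8
//   pdep(x, 0xFF00) == (x << 8) & 0xFF00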