GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
                                      Constant::getNullValue(IntTy), V, DL);
  assert(V && "Vector must be foldable");
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (match(Mask, m_SExt(m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
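  // For example (illustrative): an llvm.x86.avx.maskload.ps.256 whose mask was
  // sign-extended from an <8 x i1> value becomes an llvm.masked.load of
  // <8 x float> with a zeroinitializer pass-through.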
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
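  // For example (illustrative): a psrai.w whose amount is known to be < 16
  // can become a plain 'ashr' by a splat of that amount.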
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
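// For example (illustrative): a vpsrlvd with shift amounts <1, 33, 2, 3> still
// yields zero in the out-of-range lane, where a plain lshr would yield poison.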
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
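    // For example (illustrative): packsswb clamps each i16 source element to
    // [-128, 127] before truncating it to i8.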
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86pmulh(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder, bool IsSigned,
                               bool IsRounding) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
         "Unexpected PMULH types");
  assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by one.
  if (!IsRounding) {
    if (match(Arg0, m_One()))
      return IsSigned ? Builder.CreateAShr(Arg1, 15)
                      : ConstantAggregateZero::get(ResTy);
    if (match(Arg1, m_One()))
      return IsSigned ? Builder.CreateAShr(Arg0, 15)
                      : ConstantAggregateZero::get(ResTy);
  }

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Extend to twice the width and multiply.
  auto Cast =
      IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
  Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
  Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
  Value *Mul = Builder.CreateMul(LHS, RHS);

  if (IsRounding) {
    // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
    // extract bits[16:1].
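    // Per element this computes (illustrative):
    //   res = (int16_t)((((a32 * b32) >> 14) + 1) >> 1)
    // where a32/b32 are the sign-extended i16 inputs.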
    auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
    auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
    Mul = Builder.CreateLShr(Mul, 14);
    Mul = Builder.CreateTrunc(Mul, RndTy);
    Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
    Mul = Builder.CreateLShr(Mul, 1);
  } else {
    // PMULH/PMULHU: extract the vXi16 most significant bits.
    Mul = Builder.CreateLShr(Mul, 16);
  }

  return Builder.CreateTrunc(Mul, ResTy);
}

static Value *simplifyX86pmadd(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder,
                               bool IsPMADDWD) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());

  unsigned NumDstElts = ResTy->getNumElements();
  assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
         ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
         "Unexpected PMADD types");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Split Lo/Hi elements pairs, extend and add together.
  // PMADDWD(X,Y) =
  //   add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
  // PMADDUBSW(X,Y) =
  //   sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
  SmallVector<int> LoMask, HiMask;
  for (unsigned I = 0; I != NumDstElts; ++I) {
    LoMask.push_back(2 * I + 0);
    HiMask.push_back(2 * I + 1);
  }

  auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
  auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
  auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
  auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);

  auto LHSCast =
      IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
  LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
  RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
  RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
  Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
  Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
  return IsPMADDWD
             ? Builder.CreateAdd(Lo, Hi)
             : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyTernarylogic(const IntrinsicInst &II,
674
InstCombiner::BuilderTy &Builder) {
675
676
auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
677
if (!ArgImm || ArgImm->getValue().uge(256))
678
return nullptr;
679
680
Value *ArgA = II.getArgOperand(0);
681
Value *ArgB = II.getArgOperand(1);
682
Value *ArgC = II.getArgOperand(2);
683
684
Type *Ty = II.getType();
685
686
auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
687
return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
688
};
689
auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
690
return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
691
};
692
auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
693
return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
694
};
695
auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
696
return {Builder.CreateNot(V.first), ~V.second};
697
};
698
auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
699
auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
700
auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };
701
702
bool AIsConst = match(ArgA, m_ImmConstant());
703
bool BIsConst = match(ArgB, m_ImmConstant());
704
bool CIsConst = match(ArgC, m_ImmConstant());
705
706
bool ABIsConst = AIsConst && BIsConst;
707
bool ACIsConst = AIsConst && CIsConst;
708
bool BCIsConst = BIsConst && CIsConst;
709
bool ABCIsConst = AIsConst && BIsConst && CIsConst;
710
711
  // Use for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
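  // For example (illustrative): Imm 0x80 selects A & B & C, and indeed
  // 0xf0 & 0xcc & 0xaa == 0x80.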
715
std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
716
std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
717
std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
718
std::pair<Value *, uint8_t> Res = {nullptr, 0};
719
720
// Currently we only handle cases that convert directly to another instruction
721
// or cases where all the ops are constant. This is because we don't properly
722
// handle creating ternary ops in the backend, so splitting them here may
723
// cause regressions. As the backend improves, uncomment more cases.
724
725
uint8_t Imm = ArgImm->getValue().getZExtValue();
726
switch (Imm) {
727
case 0x0:
728
Res = {Constant::getNullValue(Ty), 0};
729
break;
730
case 0x1:
731
if (ABCIsConst)
732
Res = Nor(Or(A, B), C);
733
break;
734
case 0x2:
735
if (ABCIsConst)
736
Res = And(Nor(A, B), C);
737
break;
738
case 0x3:
739
if (ABIsConst)
740
Res = Nor(A, B);
741
break;
742
case 0x4:
743
if (ABCIsConst)
744
Res = And(Nor(A, C), B);
745
break;
746
case 0x5:
747
if (ACIsConst)
748
Res = Nor(A, C);
749
break;
750
case 0x6:
751
if (ABCIsConst)
752
Res = Nor(A, Xnor(B, C));
753
break;
754
case 0x7:
755
if (ABCIsConst)
756
Res = Nor(A, And(B, C));
757
break;
758
case 0x8:
759
if (ABCIsConst)
760
Res = Nor(A, Nand(B, C));
761
break;
762
case 0x9:
763
if (ABCIsConst)
764
Res = Nor(A, Xor(B, C));
765
break;
766
case 0xa:
767
if (ACIsConst)
768
Res = Nor(A, Not(C));
769
break;
770
case 0xb:
771
if (ABCIsConst)
772
Res = Nor(A, Nor(C, Not(B)));
773
break;
774
case 0xc:
775
if (ABIsConst)
776
Res = Nor(A, Not(B));
777
break;
778
case 0xd:
779
if (ABCIsConst)
780
Res = Nor(A, Nor(B, Not(C)));
781
break;
782
case 0xe:
783
if (ABCIsConst)
784
Res = Nor(A, Nor(B, C));
785
break;
786
case 0xf:
787
Res = Not(A);
788
break;
789
case 0x10:
790
if (ABCIsConst)
791
Res = And(A, Nor(B, C));
792
break;
793
case 0x11:
794
if (BCIsConst)
795
Res = Nor(B, C);
796
break;
797
case 0x12:
798
if (ABCIsConst)
799
Res = Nor(Xnor(A, C), B);
800
break;
801
case 0x13:
802
if (ABCIsConst)
803
Res = Nor(And(A, C), B);
804
break;
805
case 0x14:
806
if (ABCIsConst)
807
Res = Nor(Xnor(A, B), C);
808
break;
809
case 0x15:
810
if (ABCIsConst)
811
Res = Nor(And(A, B), C);
812
break;
813
case 0x16:
814
if (ABCIsConst)
815
Res = Xor(Xor(A, B), And(Nand(A, B), C));
816
break;
817
case 0x17:
818
if (ABCIsConst)
819
Res = Xor(Or(A, B), Or(Xnor(A, B), C));
820
break;
821
case 0x18:
822
if (ABCIsConst)
823
Res = Nor(Xnor(A, B), Xnor(A, C));
824
break;
825
case 0x19:
826
if (ABCIsConst)
827
Res = And(Nand(A, B), Xnor(B, C));
828
break;
829
case 0x1a:
830
if (ABCIsConst)
831
Res = Xor(A, Or(And(A, B), C));
832
break;
833
case 0x1b:
834
if (ABCIsConst)
835
Res = Xor(A, Or(Xnor(A, B), C));
836
break;
837
case 0x1c:
838
if (ABCIsConst)
839
Res = Xor(A, Or(And(A, C), B));
840
break;
841
case 0x1d:
842
if (ABCIsConst)
843
Res = Xor(A, Or(Xnor(A, C), B));
844
break;
845
case 0x1e:
846
if (ABCIsConst)
847
Res = Xor(A, Or(B, C));
848
break;
849
case 0x1f:
850
if (ABCIsConst)
851
Res = Nand(A, Or(B, C));
852
break;
853
case 0x20:
854
if (ABCIsConst)
855
Res = Nor(Nand(A, C), B);
856
break;
857
case 0x21:
858
if (ABCIsConst)
859
Res = Nor(Xor(A, C), B);
860
break;
861
case 0x22:
862
if (BCIsConst)
863
Res = Nor(B, Not(C));
864
break;
865
case 0x23:
866
if (ABCIsConst)
867
Res = Nor(B, Nor(C, Not(A)));
868
break;
869
case 0x24:
870
if (ABCIsConst)
871
Res = Nor(Xnor(A, B), Xor(A, C));
872
break;
873
case 0x25:
874
if (ABCIsConst)
875
Res = Xor(A, Nand(Nand(A, B), C));
876
break;
877
case 0x26:
878
if (ABCIsConst)
879
Res = And(Nand(A, B), Xor(B, C));
880
break;
881
case 0x27:
882
if (ABCIsConst)
883
Res = Xor(Or(Xnor(A, B), C), B);
884
break;
885
case 0x28:
886
if (ABCIsConst)
887
Res = And(Xor(A, B), C);
888
break;
889
case 0x29:
890
if (ABCIsConst)
891
Res = Xor(Xor(A, B), Nor(And(A, B), C));
892
break;
893
case 0x2a:
894
if (ABCIsConst)
895
Res = And(Nand(A, B), C);
896
break;
897
case 0x2b:
898
if (ABCIsConst)
899
Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
900
break;
901
case 0x2c:
902
if (ABCIsConst)
903
Res = Nor(Xnor(A, B), Nor(B, C));
904
break;
905
case 0x2d:
906
if (ABCIsConst)
907
Res = Xor(A, Or(B, Not(C)));
908
break;
909
case 0x2e:
910
if (ABCIsConst)
911
Res = Xor(A, Or(Xor(A, C), B));
912
break;
913
case 0x2f:
914
if (ABCIsConst)
915
Res = Nand(A, Or(B, Not(C)));
916
break;
917
case 0x30:
918
if (ABIsConst)
919
Res = Nor(B, Not(A));
920
break;
921
case 0x31:
922
if (ABCIsConst)
923
Res = Nor(Nor(A, Not(C)), B);
924
break;
925
case 0x32:
926
if (ABCIsConst)
927
Res = Nor(Nor(A, C), B);
928
break;
929
case 0x33:
930
Res = Not(B);
931
break;
932
case 0x34:
933
if (ABCIsConst)
934
Res = And(Xor(A, B), Nand(B, C));
935
break;
936
case 0x35:
937
if (ABCIsConst)
938
Res = Xor(B, Or(A, Xnor(B, C)));
939
break;
940
case 0x36:
941
if (ABCIsConst)
942
Res = Xor(Or(A, C), B);
943
break;
944
case 0x37:
945
if (ABCIsConst)
946
Res = Nand(Or(A, C), B);
947
break;
948
case 0x38:
949
if (ABCIsConst)
950
Res = Nor(Xnor(A, B), Nor(A, C));
951
break;
952
case 0x39:
953
if (ABCIsConst)
954
Res = Xor(Or(A, Not(C)), B);
955
break;
956
case 0x3a:
957
if (ABCIsConst)
958
Res = Xor(B, Or(A, Xor(B, C)));
959
break;
960
case 0x3b:
961
if (ABCIsConst)
962
Res = Nand(Or(A, Not(C)), B);
963
break;
964
case 0x3c:
965
Res = Xor(A, B);
966
break;
967
case 0x3d:
968
if (ABCIsConst)
969
Res = Xor(A, Or(Nor(A, C), B));
970
break;
971
case 0x3e:
972
if (ABCIsConst)
973
Res = Xor(A, Or(Nor(A, Not(C)), B));
974
break;
975
case 0x3f:
976
if (ABIsConst)
977
Res = Nand(A, B);
978
break;
979
case 0x40:
980
if (ABCIsConst)
981
Res = Nor(Nand(A, B), C);
982
break;
983
case 0x41:
984
if (ABCIsConst)
985
Res = Nor(Xor(A, B), C);
986
break;
987
case 0x42:
988
if (ABCIsConst)
989
Res = Nor(Xor(A, B), Xnor(A, C));
990
break;
991
case 0x43:
992
if (ABCIsConst)
993
Res = Xor(A, Nand(Nand(A, C), B));
994
break;
995
case 0x44:
996
if (BCIsConst)
997
Res = Nor(C, Not(B));
998
break;
999
case 0x45:
1000
if (ABCIsConst)
1001
Res = Nor(Nor(B, Not(A)), C);
1002
break;
1003
case 0x46:
1004
if (ABCIsConst)
1005
Res = Xor(Or(And(A, C), B), C);
1006
break;
1007
case 0x47:
1008
if (ABCIsConst)
1009
Res = Xor(Or(Xnor(A, C), B), C);
1010
break;
1011
case 0x48:
1012
if (ABCIsConst)
1013
Res = And(Xor(A, C), B);
1014
break;
1015
case 0x49:
1016
if (ABCIsConst)
1017
Res = Xor(Or(Xnor(A, B), And(A, C)), C);
1018
break;
1019
case 0x4a:
1020
if (ABCIsConst)
1021
Res = Nor(Xnor(A, C), Nor(B, C));
1022
break;
1023
case 0x4b:
1024
if (ABCIsConst)
1025
Res = Xor(A, Or(C, Not(B)));
1026
break;
1027
case 0x4c:
1028
if (ABCIsConst)
1029
Res = And(Nand(A, C), B);
1030
break;
1031
case 0x4d:
1032
if (ABCIsConst)
1033
Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
1034
break;
1035
case 0x4e:
1036
if (ABCIsConst)
1037
Res = Xor(A, Or(Xor(A, B), C));
1038
break;
1039
case 0x4f:
1040
if (ABCIsConst)
1041
Res = Nand(A, Nand(B, Not(C)));
1042
break;
1043
case 0x50:
1044
if (ACIsConst)
1045
Res = Nor(C, Not(A));
1046
break;
1047
case 0x51:
1048
if (ABCIsConst)
1049
Res = Nor(Nor(A, Not(B)), C);
1050
break;
1051
case 0x52:
1052
if (ABCIsConst)
1053
Res = And(Xor(A, C), Nand(B, C));
1054
break;
1055
case 0x53:
1056
if (ABCIsConst)
1057
Res = Xor(Or(Xnor(B, C), A), C);
1058
break;
1059
case 0x54:
1060
if (ABCIsConst)
1061
Res = Nor(Nor(A, B), C);
1062
break;
1063
case 0x55:
1064
Res = Not(C);
1065
break;
1066
case 0x56:
1067
if (ABCIsConst)
1068
Res = Xor(Or(A, B), C);
1069
break;
1070
case 0x57:
1071
if (ABCIsConst)
1072
Res = Nand(Or(A, B), C);
1073
break;
1074
case 0x58:
1075
if (ABCIsConst)
1076
Res = Nor(Nor(A, B), Xnor(A, C));
1077
break;
1078
case 0x59:
1079
if (ABCIsConst)
1080
Res = Xor(Or(A, Not(B)), C);
1081
break;
1082
case 0x5a:
1083
Res = Xor(A, C);
1084
break;
1085
case 0x5b:
1086
if (ABCIsConst)
1087
Res = Xor(A, Or(Nor(A, B), C));
1088
break;
1089
case 0x5c:
1090
if (ABCIsConst)
1091
Res = Xor(Or(Xor(B, C), A), C);
1092
break;
1093
case 0x5d:
1094
if (ABCIsConst)
1095
Res = Nand(Or(A, Not(B)), C);
1096
break;
1097
case 0x5e:
1098
if (ABCIsConst)
1099
Res = Xor(A, Or(Nor(A, Not(B)), C));
1100
break;
1101
case 0x5f:
1102
if (ACIsConst)
1103
Res = Nand(A, C);
1104
break;
1105
case 0x60:
1106
if (ABCIsConst)
1107
Res = And(A, Xor(B, C));
1108
break;
1109
case 0x61:
1110
if (ABCIsConst)
1111
Res = Xor(Or(Xnor(A, B), And(B, C)), C);
1112
break;
1113
case 0x62:
1114
if (ABCIsConst)
1115
Res = Nor(Nor(A, C), Xnor(B, C));
1116
break;
1117
case 0x63:
1118
if (ABCIsConst)
1119
Res = Xor(B, Or(C, Not(A)));
1120
break;
1121
case 0x64:
1122
if (ABCIsConst)
1123
Res = Nor(Nor(A, B), Xnor(B, C));
1124
break;
1125
case 0x65:
1126
if (ABCIsConst)
1127
Res = Xor(Or(B, Not(A)), C);
1128
break;
1129
case 0x66:
1130
Res = Xor(B, C);
1131
break;
1132
case 0x67:
1133
if (ABCIsConst)
1134
Res = Or(Nor(A, B), Xor(B, C));
1135
break;
1136
case 0x68:
1137
if (ABCIsConst)
1138
Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
1139
break;
1140
case 0x69:
1141
if (ABCIsConst)
1142
Res = Xor(Xnor(A, B), C);
1143
break;
1144
case 0x6a:
1145
if (ABCIsConst)
1146
Res = Xor(And(A, B), C);
1147
break;
1148
case 0x6b:
1149
if (ABCIsConst)
1150
Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
1151
break;
1152
case 0x6c:
1153
if (ABCIsConst)
1154
Res = Xor(And(A, C), B);
1155
break;
1156
case 0x6d:
1157
if (ABCIsConst)
1158
Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
1159
break;
1160
case 0x6e:
1161
if (ABCIsConst)
1162
Res = Or(Nor(A, Not(B)), Xor(B, C));
1163
break;
1164
case 0x6f:
1165
if (ABCIsConst)
1166
Res = Nand(A, Xnor(B, C));
1167
break;
1168
case 0x70:
1169
if (ABCIsConst)
1170
Res = And(A, Nand(B, C));
1171
break;
1172
case 0x71:
1173
if (ABCIsConst)
1174
Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
1175
break;
1176
case 0x72:
1177
if (ABCIsConst)
1178
Res = Xor(Or(Xor(A, B), C), B);
1179
break;
1180
case 0x73:
1181
if (ABCIsConst)
1182
Res = Nand(Nand(A, Not(C)), B);
1183
break;
1184
case 0x74:
1185
if (ABCIsConst)
1186
Res = Xor(Or(Xor(A, C), B), C);
1187
break;
1188
case 0x75:
1189
if (ABCIsConst)
1190
Res = Nand(Nand(A, Not(B)), C);
1191
break;
1192
case 0x76:
1193
if (ABCIsConst)
1194
Res = Xor(B, Or(Nor(B, Not(A)), C));
1195
break;
1196
case 0x77:
1197
if (BCIsConst)
1198
Res = Nand(B, C);
1199
break;
1200
case 0x78:
1201
if (ABCIsConst)
1202
Res = Xor(A, And(B, C));
1203
break;
1204
case 0x79:
1205
if (ABCIsConst)
1206
Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
1207
break;
1208
case 0x7a:
1209
if (ABCIsConst)
1210
Res = Or(Xor(A, C), Nor(B, Not(A)));
1211
break;
1212
case 0x7b:
1213
if (ABCIsConst)
1214
Res = Nand(Xnor(A, C), B);
1215
break;
1216
case 0x7c:
1217
if (ABCIsConst)
1218
Res = Or(Xor(A, B), Nor(C, Not(A)));
1219
break;
1220
case 0x7d:
1221
if (ABCIsConst)
1222
Res = Nand(Xnor(A, B), C);
1223
break;
1224
case 0x7e:
1225
if (ABCIsConst)
1226
Res = Or(Xor(A, B), Xor(A, C));
1227
break;
1228
case 0x7f:
1229
if (ABCIsConst)
1230
Res = Nand(And(A, B), C);
1231
break;
1232
case 0x80:
1233
if (ABCIsConst)
1234
Res = And(And(A, B), C);
1235
break;
1236
case 0x81:
1237
if (ABCIsConst)
1238
Res = Nor(Xor(A, B), Xor(A, C));
1239
break;
1240
case 0x82:
1241
if (ABCIsConst)
1242
Res = And(Xnor(A, B), C);
1243
break;
1244
case 0x83:
1245
if (ABCIsConst)
1246
Res = Nor(Xor(A, B), Nor(C, Not(A)));
1247
break;
1248
case 0x84:
1249
if (ABCIsConst)
1250
Res = And(Xnor(A, C), B);
1251
break;
1252
case 0x85:
1253
if (ABCIsConst)
1254
Res = Nor(Xor(A, C), Nor(B, Not(A)));
1255
break;
1256
case 0x86:
1257
if (ABCIsConst)
1258
Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
1259
break;
1260
case 0x87:
1261
if (ABCIsConst)
1262
Res = Xor(A, Nand(B, C));
1263
break;
1264
case 0x88:
1265
Res = And(B, C);
1266
break;
1267
case 0x89:
1268
if (ABCIsConst)
1269
Res = Xor(B, Nor(Nor(B, Not(A)), C));
1270
break;
1271
case 0x8a:
1272
if (ABCIsConst)
1273
Res = And(Nand(A, Not(B)), C);
1274
break;
1275
case 0x8b:
1276
if (ABCIsConst)
1277
Res = Xor(Nor(Xor(A, C), B), C);
1278
break;
1279
case 0x8c:
1280
if (ABCIsConst)
1281
Res = And(Nand(A, Not(C)), B);
1282
break;
1283
case 0x8d:
1284
if (ABCIsConst)
1285
Res = Xor(Nor(Xor(A, B), C), B);
1286
break;
1287
case 0x8e:
1288
if (ABCIsConst)
1289
Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
1290
break;
1291
case 0x8f:
1292
if (ABCIsConst)
1293
Res = Nand(A, Nand(B, C));
1294
break;
1295
case 0x90:
1296
if (ABCIsConst)
1297
Res = And(A, Xnor(B, C));
1298
break;
1299
case 0x91:
1300
if (ABCIsConst)
1301
Res = Nor(Nor(A, Not(B)), Xor(B, C));
1302
break;
1303
case 0x92:
1304
if (ABCIsConst)
1305
Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
1306
break;
1307
case 0x93:
1308
if (ABCIsConst)
1309
Res = Xor(Nand(A, C), B);
1310
break;
1311
case 0x94:
1312
if (ABCIsConst)
1313
Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
1314
break;
1315
case 0x95:
1316
if (ABCIsConst)
1317
Res = Xor(Nand(A, B), C);
1318
break;
1319
case 0x96:
1320
if (ABCIsConst)
1321
Res = Xor(Xor(A, B), C);
1322
break;
1323
case 0x97:
1324
if (ABCIsConst)
1325
Res = Xor(Xor(A, B), Or(Nor(A, B), C));
1326
break;
1327
case 0x98:
1328
if (ABCIsConst)
1329
Res = Nor(Nor(A, B), Xor(B, C));
1330
break;
1331
case 0x99:
1332
if (BCIsConst)
1333
Res = Xnor(B, C);
1334
break;
1335
case 0x9a:
1336
if (ABCIsConst)
1337
Res = Xor(Nor(B, Not(A)), C);
1338
break;
1339
case 0x9b:
1340
if (ABCIsConst)
1341
Res = Or(Nor(A, B), Xnor(B, C));
1342
break;
1343
case 0x9c:
1344
if (ABCIsConst)
1345
Res = Xor(B, Nor(C, Not(A)));
1346
break;
1347
case 0x9d:
1348
if (ABCIsConst)
1349
Res = Or(Nor(A, C), Xnor(B, C));
1350
break;
1351
case 0x9e:
1352
if (ABCIsConst)
1353
Res = Xor(And(Xor(A, B), Nand(B, C)), C);
1354
break;
1355
case 0x9f:
1356
if (ABCIsConst)
1357
Res = Nand(A, Xor(B, C));
1358
break;
1359
case 0xa0:
1360
Res = And(A, C);
1361
break;
1362
case 0xa1:
1363
if (ABCIsConst)
1364
Res = Xor(A, Nor(Nor(A, Not(B)), C));
1365
break;
1366
case 0xa2:
1367
if (ABCIsConst)
1368
Res = And(Or(A, Not(B)), C);
1369
break;
1370
case 0xa3:
1371
if (ABCIsConst)
1372
Res = Xor(Nor(Xor(B, C), A), C);
1373
break;
1374
case 0xa4:
1375
if (ABCIsConst)
1376
Res = Xor(A, Nor(Nor(A, B), C));
1377
break;
1378
case 0xa5:
1379
if (ACIsConst)
1380
Res = Xnor(A, C);
1381
break;
1382
case 0xa6:
1383
if (ABCIsConst)
1384
Res = Xor(Nor(A, Not(B)), C);
1385
break;
1386
case 0xa7:
1387
if (ABCIsConst)
1388
Res = Or(Nor(A, B), Xnor(A, C));
1389
break;
1390
case 0xa8:
1391
if (ABCIsConst)
1392
Res = And(Or(A, B), C);
1393
break;
1394
case 0xa9:
1395
if (ABCIsConst)
1396
Res = Xor(Nor(A, B), C);
1397
break;
1398
case 0xaa:
1399
Res = C;
1400
break;
1401
case 0xab:
1402
if (ABCIsConst)
1403
Res = Or(Nor(A, B), C);
1404
break;
1405
case 0xac:
1406
if (ABCIsConst)
1407
Res = Xor(Nor(Xnor(B, C), A), C);
1408
break;
1409
case 0xad:
1410
if (ABCIsConst)
1411
Res = Or(Xnor(A, C), And(B, C));
1412
break;
1413
case 0xae:
1414
if (ABCIsConst)
1415
Res = Or(Nor(A, Not(B)), C);
1416
break;
1417
case 0xaf:
1418
if (ACIsConst)
1419
Res = Or(C, Not(A));
1420
break;
1421
case 0xb0:
1422
if (ABCIsConst)
1423
Res = And(A, Nand(B, Not(C)));
1424
break;
1425
case 0xb1:
1426
if (ABCIsConst)
1427
Res = Xor(A, Nor(Xor(A, B), C));
1428
break;
1429
case 0xb2:
1430
if (ABCIsConst)
1431
Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
1432
break;
1433
case 0xb3:
1434
if (ABCIsConst)
1435
Res = Nand(Nand(A, C), B);
1436
break;
1437
case 0xb4:
1438
if (ABCIsConst)
1439
Res = Xor(A, Nor(C, Not(B)));
1440
break;
1441
case 0xb5:
1442
if (ABCIsConst)
1443
Res = Or(Xnor(A, C), Nor(B, C));
1444
break;
1445
case 0xb6:
1446
if (ABCIsConst)
1447
Res = Xor(And(Xor(A, B), Nand(A, C)), C);
1448
break;
1449
case 0xb7:
1450
if (ABCIsConst)
1451
Res = Nand(Xor(A, C), B);
1452
break;
1453
case 0xb8:
1454
if (ABCIsConst)
1455
Res = Xor(Nor(Xnor(A, C), B), C);
1456
break;
1457
case 0xb9:
1458
if (ABCIsConst)
1459
Res = Xor(Nor(And(A, C), B), C);
1460
break;
1461
case 0xba:
1462
if (ABCIsConst)
1463
Res = Or(Nor(B, Not(A)), C);
1464
break;
1465
case 0xbb:
1466
if (BCIsConst)
1467
Res = Or(C, Not(B));
1468
break;
1469
case 0xbc:
1470
if (ABCIsConst)
1471
Res = Xor(A, And(Nand(A, C), B));
1472
break;
1473
case 0xbd:
1474
if (ABCIsConst)
1475
Res = Or(Xor(A, B), Xnor(A, C));
1476
break;
1477
case 0xbe:
1478
if (ABCIsConst)
1479
Res = Or(Xor(A, B), C);
1480
break;
1481
case 0xbf:
1482
if (ABCIsConst)
1483
Res = Or(Nand(A, B), C);
1484
break;
1485
case 0xc0:
1486
Res = And(A, B);
1487
break;
1488
case 0xc1:
1489
if (ABCIsConst)
1490
Res = Xor(A, Nor(Nor(A, Not(C)), B));
1491
break;
1492
case 0xc2:
1493
if (ABCIsConst)
1494
Res = Xor(A, Nor(Nor(A, C), B));
1495
break;
1496
case 0xc3:
1497
if (ABIsConst)
1498
Res = Xnor(A, B);
1499
break;
1500
case 0xc4:
1501
if (ABCIsConst)
1502
Res = And(Or(A, Not(C)), B);
1503
break;
1504
case 0xc5:
1505
if (ABCIsConst)
1506
Res = Xor(B, Nor(A, Xor(B, C)));
1507
break;
1508
case 0xc6:
1509
if (ABCIsConst)
1510
Res = Xor(Nor(A, Not(C)), B);
1511
break;
1512
case 0xc7:
1513
if (ABCIsConst)
1514
Res = Or(Xnor(A, B), Nor(A, C));
1515
break;
1516
case 0xc8:
1517
if (ABCIsConst)
1518
Res = And(Or(A, C), B);
1519
break;
1520
case 0xc9:
1521
if (ABCIsConst)
1522
Res = Xor(Nor(A, C), B);
1523
break;
1524
case 0xca:
1525
if (ABCIsConst)
1526
Res = Xor(B, Nor(A, Xnor(B, C)));
1527
break;
1528
case 0xcb:
1529
if (ABCIsConst)
1530
Res = Or(Xnor(A, B), And(B, C));
1531
break;
1532
case 0xcc:
1533
Res = B;
1534
break;
1535
case 0xcd:
1536
if (ABCIsConst)
1537
Res = Or(Nor(A, C), B);
1538
break;
1539
case 0xce:
1540
if (ABCIsConst)
1541
Res = Or(Nor(A, Not(C)), B);
1542
break;
1543
case 0xcf:
1544
if (ABIsConst)
1545
Res = Or(B, Not(A));
1546
break;
1547
case 0xd0:
1548
if (ABCIsConst)
1549
Res = And(A, Or(B, Not(C)));
1550
break;
1551
case 0xd1:
1552
if (ABCIsConst)
1553
Res = Xor(A, Nor(Xor(A, C), B));
1554
break;
1555
case 0xd2:
1556
if (ABCIsConst)
1557
Res = Xor(A, Nor(B, Not(C)));
1558
break;
1559
case 0xd3:
1560
if (ABCIsConst)
1561
Res = Or(Xnor(A, B), Nor(B, C));
1562
break;
1563
case 0xd4:
1564
if (ABCIsConst)
1565
Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
1566
break;
1567
case 0xd5:
1568
if (ABCIsConst)
1569
Res = Nand(Nand(A, B), C);
1570
break;
1571
case 0xd6:
1572
if (ABCIsConst)
1573
Res = Xor(Xor(A, B), Or(And(A, B), C));
1574
break;
1575
case 0xd7:
1576
if (ABCIsConst)
1577
Res = Nand(Xor(A, B), C);
1578
break;
1579
case 0xd8:
1580
if (ABCIsConst)
1581
Res = Xor(Nor(Xnor(A, B), C), B);
1582
break;
1583
case 0xd9:
1584
if (ABCIsConst)
1585
Res = Or(And(A, B), Xnor(B, C));
1586
break;
1587
case 0xda:
1588
if (ABCIsConst)
1589
Res = Xor(A, And(Nand(A, B), C));
1590
break;
1591
case 0xdb:
1592
if (ABCIsConst)
1593
Res = Or(Xnor(A, B), Xor(A, C));
1594
break;
1595
case 0xdc:
1596
if (ABCIsConst)
1597
Res = Or(B, Nor(C, Not(A)));
1598
break;
1599
case 0xdd:
1600
if (BCIsConst)
1601
Res = Or(B, Not(C));
1602
break;
1603
case 0xde:
1604
if (ABCIsConst)
1605
Res = Or(Xor(A, C), B);
1606
break;
1607
case 0xdf:
1608
if (ABCIsConst)
1609
Res = Or(Nand(A, C), B);
1610
break;
1611
case 0xe0:
1612
if (ABCIsConst)
1613
Res = And(A, Or(B, C));
1614
break;
1615
case 0xe1:
1616
if (ABCIsConst)
1617
Res = Xor(A, Nor(B, C));
1618
break;
1619
case 0xe2:
1620
if (ABCIsConst)
1621
Res = Xor(A, Nor(Xnor(A, C), B));
1622
break;
1623
case 0xe3:
1624
if (ABCIsConst)
1625
Res = Xor(A, Nor(And(A, C), B));
1626
break;
1627
case 0xe4:
1628
if (ABCIsConst)
1629
Res = Xor(A, Nor(Xnor(A, B), C));
1630
break;
1631
case 0xe5:
1632
if (ABCIsConst)
1633
Res = Xor(A, Nor(And(A, B), C));
1634
break;
1635
case 0xe6:
1636
if (ABCIsConst)
1637
Res = Or(And(A, B), Xor(B, C));
1638
break;
1639
case 0xe7:
1640
if (ABCIsConst)
1641
Res = Or(Xnor(A, B), Xnor(A, C));
1642
break;
1643
case 0xe8:
1644
if (ABCIsConst)
1645
Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
1646
break;
1647
case 0xe9:
1648
if (ABCIsConst)
1649
Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
1650
break;
1651
case 0xea:
1652
if (ABCIsConst)
1653
Res = Or(And(A, B), C);
1654
break;
1655
case 0xeb:
1656
if (ABCIsConst)
1657
Res = Or(Xnor(A, B), C);
1658
break;
1659
case 0xec:
1660
if (ABCIsConst)
1661
Res = Or(And(A, C), B);
1662
break;
1663
case 0xed:
1664
if (ABCIsConst)
1665
Res = Or(Xnor(A, C), B);
1666
break;
1667
case 0xee:
1668
Res = Or(B, C);
1669
break;
1670
case 0xef:
1671
if (ABCIsConst)
1672
Res = Nand(A, Nor(B, C));
1673
break;
1674
case 0xf0:
1675
Res = A;
1676
break;
1677
case 0xf1:
1678
if (ABCIsConst)
1679
Res = Or(A, Nor(B, C));
1680
break;
1681
case 0xf2:
1682
if (ABCIsConst)
1683
Res = Or(A, Nor(B, Not(C)));
1684
break;
1685
case 0xf3:
1686
if (ABIsConst)
1687
Res = Or(A, Not(B));
1688
break;
1689
case 0xf4:
1690
if (ABCIsConst)
1691
Res = Or(A, Nor(C, Not(B)));
1692
break;
1693
case 0xf5:
1694
if (ACIsConst)
1695
Res = Or(A, Not(C));
1696
break;
1697
case 0xf6:
1698
if (ABCIsConst)
1699
Res = Or(A, Xor(B, C));
1700
break;
1701
case 0xf7:
1702
if (ABCIsConst)
1703
Res = Or(A, Nand(B, C));
1704
break;
1705
case 0xf8:
1706
if (ABCIsConst)
1707
Res = Or(A, And(B, C));
1708
break;
1709
case 0xf9:
1710
if (ABCIsConst)
1711
Res = Or(A, Xnor(B, C));
1712
break;
1713
case 0xfa:
1714
Res = Or(A, C);
1715
break;
1716
case 0xfb:
1717
if (ABCIsConst)
1718
Res = Nand(Nor(A, C), B);
1719
break;
1720
case 0xfc:
1721
Res = Or(A, B);
1722
break;
1723
case 0xfd:
1724
if (ABCIsConst)
1725
Res = Nand(Nor(A, B), C);
1726
break;
1727
case 0xfe:
1728
if (ABCIsConst)
1729
Res = Or(Or(A, B), C);
1730
break;
1731
case 0xff:
1732
Res = {Constant::getAllOnesValue(Ty), 0xff};
1733
break;
1734
}
1735
1736
assert((Res.first == nullptr || Res.second == Imm) &&
1737
"Simplification of ternary logic does not verify!");
1738
return Res.first;
1739
}
1740
1741
static Value *simplifyX86insertps(const IntrinsicInst &II,
1742
InstCombiner::BuilderTy &Builder) {
1743
auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
1744
if (!CInt)
1745
return nullptr;
1746
1747
auto *VecTy = cast<FixedVectorType>(II.getType());
1748
assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
1749
1750
// The immediate permute control byte looks like this:
1751
// [3:0] - zero mask for each 32-bit lane
1752
// [5:4] - select one 32-bit destination lane
1753
// [7:6] - select one 32-bit source lane
1754
1755
uint8_t Imm = CInt->getZExtValue();
1756
uint8_t ZMask = Imm & 0xf;
1757
uint8_t DestLane = (Imm >> 4) & 0x3;
1758
uint8_t SourceLane = (Imm >> 6) & 0x3;
1759
1760
ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
1761
1762
// If all zero mask bits are set, this was just a weird way to
1763
// generate a zero vector.
1764
if (ZMask == 0xf)
1765
return ZeroVector;
1766
1767
// Initialize by passing all of the first source bits through.
1768
int ShuffleMask[4] = {0, 1, 2, 3};
1769
1770
// We may replace the second operand with the zero vector.
1771
Value *V1 = II.getArgOperand(1);
1772
1773
if (ZMask) {
1774
// If the zero mask is being used with a single input or the zero mask
1775
// overrides the destination lane, this is a shuffle with the zero vector.
1776
if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
1777
(ZMask & (1 << DestLane))) {
1778
V1 = ZeroVector;
1779
// We may still move 32-bits of the first source vector from one lane
1780
// to another.
1781
ShuffleMask[DestLane] = SourceLane;
1782
// The zero mask may override the previous insert operation.
1783
for (unsigned i = 0; i < 4; ++i)
1784
if ((ZMask >> i) & 0x1)
1785
ShuffleMask[i] = i + 4;
1786
} else {
1787
// TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
1788
return nullptr;
1789
}
1790
} else {
1791
// Replace the selected destination lane with the selected source lane.
1792
ShuffleMask[DestLane] = SourceLane + 4;
1793
}
1794
1795
return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
1796
}
1797
1798
/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
1799
/// or conversion to a shuffle vector.
1800
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
1801
ConstantInt *CILength, ConstantInt *CIIndex,
1802
InstCombiner::BuilderTy &Builder) {
1803
auto LowConstantHighUndef = [&](uint64_t Val) {
1804
Type *IntTy64 = Type::getInt64Ty(II.getContext());
1805
Constant *Args[] = {ConstantInt::get(IntTy64, Val),
1806
UndefValue::get(IntTy64)};
1807
return ConstantVector::get(Args);
1808
};
1809
1810
// See if we're dealing with constant values.
1811
auto *C0 = dyn_cast<Constant>(Op0);
1812
auto *CI0 =
1813
C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1814
: nullptr;
1815
1816
// Attempt to constant fold.
1817
if (CILength && CIIndex) {
1818
// From AMD documentation: "The bit index and field length are each six
1819
// bits in length other bits of the field are ignored."
1820
APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
1821
APInt APLength = CILength->getValue().zextOrTrunc(6);
1822
1823
unsigned Index = APIndex.getZExtValue();
1824
1825
// From AMD documentation: "a value of zero in the field length is
1826
// defined as length of 64".
1827
unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1828
1829
// From AMD documentation: "If the sum of the bit index + length field
1830
// is greater than 64, the results are undefined".
1831
unsigned End = Index + Length;
1832
1833
// Note that both field index and field length are 8-bit quantities.
1834
// Since variables 'Index' and 'Length' are unsigned values
1835
// obtained from zero-extending field index and field length
1836
// respectively, their sum should never wrap around.
1837
if (End > 64)
1838
return UndefValue::get(II.getType());
1839
1840
// If we are inserting whole bytes, we can convert this to a shuffle.
1841
// Lowering can recognize EXTRQI shuffle masks.
1842
if ((Length % 8) == 0 && (Index % 8) == 0) {
1843
// Convert bit indices to byte indices.
1844
Length /= 8;
1845
Index /= 8;
1846
1847
Type *IntTy8 = Type::getInt8Ty(II.getContext());
1848
auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1849
1850
SmallVector<int, 16> ShuffleMask;
1851
for (int i = 0; i != (int)Length; ++i)
1852
ShuffleMask.push_back(i + Index);
1853
for (int i = Length; i != 8; ++i)
1854
ShuffleMask.push_back(i + 16);
1855
for (int i = 8; i != 16; ++i)
1856
ShuffleMask.push_back(-1);
1857
1858
Value *SV = Builder.CreateShuffleVector(
1859
Builder.CreateBitCast(Op0, ShufTy),
1860
ConstantAggregateZero::get(ShufTy), ShuffleMask);
1861
return Builder.CreateBitCast(SV, II.getType());
1862
}
1863
1864
// Constant Fold - shift Index'th bit to lowest position and mask off
1865
// Length bits.
1866
if (CI0) {
1867
APInt Elt = CI0->getValue();
1868
Elt.lshrInPlace(Index);
1869
Elt = Elt.zextOrTrunc(Length);
1870
return LowConstantHighUndef(Elt.getZExtValue());
1871
}
1872
1873
// If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
1874
if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
1875
Value *Args[] = {Op0, CILength, CIIndex};
1876
Module *M = II.getModule();
1877
Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
1878
return Builder.CreateCall(F, Args);
1879
}
1880
}
1881
1882
// Constant Fold - extraction from zero is always {zero, undef}.
1883
if (CI0 && CI0->isZero())
1884
return LowConstantHighUndef(0);
1885
1886
return nullptr;
1887
}
1888
1889
/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
1890
/// folding or conversion to a shuffle vector.
1891
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
1892
APInt APLength, APInt APIndex,
1893
InstCombiner::BuilderTy &Builder) {
1894
// From AMD documentation: "The bit index and field length are each six bits
1895
// in length other bits of the field are ignored."
1896
APIndex = APIndex.zextOrTrunc(6);
1897
APLength = APLength.zextOrTrunc(6);
1898
1899
// Attempt to constant fold.
1900
unsigned Index = APIndex.getZExtValue();
1901
1902
// From AMD documentation: "a value of zero in the field length is
1903
// defined as length of 64".
1904
unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
1905
1906
// From AMD documentation: "If the sum of the bit index + length field
1907
// is greater than 64, the results are undefined".
1908
unsigned End = Index + Length;
1909
1910
// Note that both field index and field length are 8-bit quantities.
1911
// Since variables 'Index' and 'Length' are unsigned values
1912
// obtained from zero-extending field index and field length
1913
// respectively, their sum should never wrap around.
1914
if (End > 64)
1915
return UndefValue::get(II.getType());
1916
1917
// If we are inserting whole bytes, we can convert this to a shuffle.
1918
// Lowering can recognize INSERTQI shuffle masks.
1919
if ((Length % 8) == 0 && (Index % 8) == 0) {
1920
// Convert bit indices to byte indices.
1921
Length /= 8;
1922
Index /= 8;
1923
1924
Type *IntTy8 = Type::getInt8Ty(II.getContext());
1925
auto *ShufTy = FixedVectorType::get(IntTy8, 16);
1926
1927
SmallVector<int, 16> ShuffleMask;
1928
for (int i = 0; i != (int)Index; ++i)
1929
ShuffleMask.push_back(i);
1930
for (int i = 0; i != (int)Length; ++i)
1931
ShuffleMask.push_back(i + 16);
1932
for (int i = Index + Length; i != 8; ++i)
1933
ShuffleMask.push_back(i);
1934
for (int i = 8; i != 16; ++i)
1935
ShuffleMask.push_back(-1);
1936
1937
Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
1938
Builder.CreateBitCast(Op1, ShufTy),
1939
ShuffleMask);
1940
return Builder.CreateBitCast(SV, II.getType());
1941
}
1942
1943
// See if we're dealing with constant values.
1944
auto *C0 = dyn_cast<Constant>(Op0);
1945
auto *C1 = dyn_cast<Constant>(Op1);
1946
auto *CI00 =
1947
C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
1948
: nullptr;
1949
auto *CI10 =
1950
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
1951
: nullptr;
1952
1953
// Constant Fold - insert bottom Length bits starting at the Index'th bit.
1954
if (CI00 && CI10) {
1955
APInt V00 = CI00->getValue();
1956
APInt V10 = CI10->getValue();
1957
APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
1958
V00 = V00 & ~Mask;
1959
V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
1960
APInt Val = V00 | V10;
1961
Type *IntTy64 = Type::getInt64Ty(II.getContext());
1962
Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
1963
UndefValue::get(IntTy64)};
1964
return ConstantVector::get(Args);
1965
}
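// Worked example (illustrative values): with Index == 8 and Length == 16,
// Mask == 0xFFFF << 8, so bits 8..23 of the first source element are cleared
// and replaced by the low 16 bits of the second source element, i.e.
// Val == (V00 & ~0xFFFF00) | ((V10 & 0xFFFF) << 8).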
1966
1967
// If we were an INSERTQ call, we'll save demanded elements if we convert to
1968
// INSERTQI.
1969
if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
1970
Type *IntTy8 = Type::getInt8Ty(II.getContext());
1971
Constant *CILength = ConstantInt::get(IntTy8, Length, false);
1972
Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
1973
1974
Value *Args[] = {Op0, Op1, CILength, CIIndex};
1975
Module *M = II.getModule();
1976
Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
1977
return Builder.CreateCall(F, Args);
1978
}
1979
1980
return nullptr;
1981
}
1982
1983
/// Attempt to convert pshufb* to shufflevector if the mask is constant.
1984
static Value *simplifyX86pshufb(const IntrinsicInst &II,
1985
InstCombiner::BuilderTy &Builder) {
1986
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1987
if (!V)
1988
return nullptr;
1989
1990
auto *VecTy = cast<FixedVectorType>(II.getType());
1991
unsigned NumElts = VecTy->getNumElements();
1992
assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
1993
"Unexpected number of elements in shuffle mask!");
1994
1995
// Construct a shuffle mask from constant integers or UNDEFs.
1996
int Indexes[64];
1997
1998
// Each byte in the shuffle control mask forms an index to permute the
1999
// corresponding byte in the destination operand.
2000
for (unsigned I = 0; I < NumElts; ++I) {
2001
Constant *COp = V->getAggregateElement(I);
2002
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2003
return nullptr;
2004
2005
if (isa<UndefValue>(COp)) {
2006
Indexes[I] = -1;
2007
continue;
2008
}
2009
2010
int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
2011
2012
// If the most significant bit (bit[7]) of each byte of the shuffle
2013
// control mask is set, then zero is written in the result byte.
2014
// The zero vector is in the right-hand side of the resulting
2015
// shufflevector.
2016
2017
// The value of each index for the high 128-bit lane is the least
2018
// significant 4 bits of the respective shuffle control byte.
2019
Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
2020
Indexes[I] = Index;
2021
}
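// Worked example (illustrative values, AVX2 <32 x i8>): a control byte of
// 0x05 at position I == 20 yields index (5 & 0x0F) + (20 & 0xF0) == 21, i.e.
// the byte is taken from the same 128-bit lane of the source. A control byte
// with bit 7 set (e.g. 0x83, negative as int8_t) yields NumElts + (I & 0xF0),
// which selects a byte from the zero vector operand.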
2022
2023
auto V1 = II.getArgOperand(0);
2024
auto V2 = Constant::getNullValue(VecTy);
2025
return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2026
}
2027
2028
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
2029
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
2030
InstCombiner::BuilderTy &Builder) {
2031
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2032
if (!V)
2033
return nullptr;
2034
2035
auto *VecTy = cast<FixedVectorType>(II.getType());
2036
unsigned NumElts = VecTy->getNumElements();
2037
bool IsPD = VecTy->getScalarType()->isDoubleTy();
2038
unsigned NumLaneElts = IsPD ? 2 : 4;
2039
assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2040
2041
// Construct a shuffle mask from constant integers or UNDEFs.
2042
int Indexes[16];
2043
2044
// The intrinsics only read one or two bits; clear the rest.
2045
for (unsigned I = 0; I < NumElts; ++I) {
2046
Constant *COp = V->getAggregateElement(I);
2047
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2048
return nullptr;
2049
2050
if (isa<UndefValue>(COp)) {
2051
Indexes[I] = -1;
2052
continue;
2053
}
2054
2055
APInt Index = cast<ConstantInt>(COp)->getValue();
2056
Index = Index.zextOrTrunc(32).getLoBits(2);
2057
2058
// The PD variants use bit 1 to select the per-lane element index, so
// shift down to convert to a generic shuffle mask index.
2060
if (IsPD)
2061
Index.lshrInPlace(1);
2062
2063
// The _256 variants are a bit trickier since the mask bits always index
// into the corresponding 128-bit half. In order to convert to a generic
// shuffle, we have to make that explicit.
2066
Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2067
2068
Indexes[I] = Index.getZExtValue();
2069
}
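// Worked example (illustrative values, vpermilvar_pd_256 on <4 x double>): a
// control element of 2 at position I == 2 gives getLoBits(2) == 2, lshr(1)
// == 1 for the PD form, plus the lane base (2 / 2) * 2 == 2, so result
// element 2 reads source element 3.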
2070
2071
auto V1 = II.getArgOperand(0);
2072
return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2073
}
2074
2075
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2076
static Value *simplifyX86vpermv(const IntrinsicInst &II,
2077
InstCombiner::BuilderTy &Builder) {
2078
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2079
if (!V)
2080
return nullptr;
2081
2082
auto *VecTy = cast<FixedVectorType>(II.getType());
2083
unsigned Size = VecTy->getNumElements();
2084
assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2085
"Unexpected shuffle mask size");
2086
2087
// Construct a shuffle mask from constant integers or UNDEFs.
2088
int Indexes[64];
2089
2090
for (unsigned I = 0; I < Size; ++I) {
2091
Constant *COp = V->getAggregateElement(I);
2092
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2093
return nullptr;
2094
2095
if (isa<UndefValue>(COp)) {
2096
Indexes[I] = -1;
2097
continue;
2098
}
2099
2100
uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2101
Index &= Size - 1;
2102
Indexes[I] = Index;
2103
}
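// Worked example (illustrative values, vpermd on <8 x i32>): a mask element
// of 11 becomes 11 & (8 - 1) == 3, so that result element reads source
// element 3; the upper mask bits are ignored, as the instruction only uses
// the low index bits.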
2104
2105
auto V1 = II.getArgOperand(0);
2106
return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2107
}
2108
2109
/// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
2110
static Value *simplifyX86vpermv3(const IntrinsicInst &II,
2111
InstCombiner::BuilderTy &Builder) {
2112
auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2113
if (!V)
2114
return nullptr;
2115
2116
auto *VecTy = cast<FixedVectorType>(II.getType());
2117
unsigned Size = VecTy->getNumElements();
2118
assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
2119
Size == 64) &&
2120
"Unexpected shuffle mask size");
2121
2122
// Construct a shuffle mask from constant integers or UNDEFs.
2123
int Indexes[64];
2124
2125
for (unsigned I = 0; I < Size; ++I) {
2126
Constant *COp = V->getAggregateElement(I);
2127
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2128
return nullptr;
2129
2130
if (isa<UndefValue>(COp)) {
2131
Indexes[I] = -1;
2132
continue;
2133
}
2134
2135
uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2136
Index &= (2 * Size) - 1;
2137
Indexes[I] = Index;
2138
}
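// Worked example (illustrative values, Size == 4): a mask element of 6
// becomes 6 & 7 == 6; indices 0..3 select from the first data operand and
// 4..7 from the second, so 6 picks element 2 of the second data operand.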
2139
2140
auto V1 = II.getArgOperand(0);
2141
auto V2 = II.getArgOperand(2);
2142
return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
2143
}
2144
2145
std::optional<Instruction *>
2146
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
2147
auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
2148
unsigned DemandedWidth) {
2149
APInt UndefElts(Width, 0);
2150
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
2151
return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
2152
};
2153
2154
Intrinsic::ID IID = II.getIntrinsicID();
2155
switch (IID) {
2156
case Intrinsic::x86_bmi_bextr_32:
2157
case Intrinsic::x86_bmi_bextr_64:
2158
case Intrinsic::x86_tbm_bextri_u32:
2159
case Intrinsic::x86_tbm_bextri_u64:
2160
// If the RHS is a constant we can try some simplifications.
2161
if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2162
uint64_t Shift = C->getZExtValue();
2163
uint64_t Length = (Shift >> 8) & 0xff;
2164
Shift &= 0xff;
2165
unsigned BitWidth = II.getType()->getIntegerBitWidth();
2166
// If the length is 0 or the shift is out of range, replace with zero.
2167
if (Length == 0 || Shift >= BitWidth) {
2168
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2169
}
2170
// If the LHS is also a constant, we can completely constant fold this.
2171
if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2172
uint64_t Result = InC->getZExtValue() >> Shift;
2173
if (Length > BitWidth)
2174
Length = BitWidth;
2175
Result &= maskTrailingOnes<uint64_t>(Length);
2176
return IC.replaceInstUsesWith(II,
2177
ConstantInt::get(II.getType(), Result));
2178
}
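// Worked example (illustrative values): bextr(0x12345678, 0x0810) has
// Shift == 0x10 and Length == 0x08, so the fold above yields
// (0x12345678 >> 16) & 0xFF == 0x34.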
2179
// TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2180
// are only masking bits that a shift already cleared?
2181
}
2182
break;
2183
2184
case Intrinsic::x86_bmi_bzhi_32:
2185
case Intrinsic::x86_bmi_bzhi_64:
2186
// If the RHS is a constant we can try some simplifications.
2187
if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2188
uint64_t Index = C->getZExtValue() & 0xff;
2189
unsigned BitWidth = II.getType()->getIntegerBitWidth();
2190
if (Index >= BitWidth) {
2191
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2192
}
2193
if (Index == 0) {
2194
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2195
}
2196
// If the LHS is also a constant, we can completely constant fold this.
2197
if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2198
uint64_t Result = InC->getZExtValue();
2199
Result &= maskTrailingOnes<uint64_t>(Index);
2200
return IC.replaceInstUsesWith(II,
2201
ConstantInt::get(II.getType(), Result));
2202
}
2203
// TODO should we convert this to an AND if the RHS is constant?
2204
}
2205
break;
2206
case Intrinsic::x86_bmi_pext_32:
2207
case Intrinsic::x86_bmi_pext_64:
2208
if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2209
if (MaskC->isNullValue()) {
2210
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2211
}
2212
if (MaskC->isAllOnesValue()) {
2213
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2214
}
2215
2216
unsigned MaskIdx, MaskLen;
2217
if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2218
// Any single contiguous sequence of 1s anywhere in the mask simply
// describes a subset of the input bits shifted to the appropriate
// position. Replace with the straightforward IR.
2221
Value *Input = II.getArgOperand(0);
2222
Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2223
Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2224
Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2225
return IC.replaceInstUsesWith(II, Shifted);
2226
}
2227
2228
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2229
uint64_t Src = SrcC->getZExtValue();
2230
uint64_t Mask = MaskC->getZExtValue();
2231
uint64_t Result = 0;
2232
uint64_t BitToSet = 1;
2233
2234
while (Mask) {
2235
// Isolate lowest set bit.
2236
uint64_t BitToTest = Mask & -Mask;
2237
if (BitToTest & Src)
2238
Result |= BitToSet;
2239
2240
BitToSet <<= 1;
2241
// Clear lowest set bit.
2242
Mask &= Mask - 1;
2243
}
2244
2245
return IC.replaceInstUsesWith(II,
2246
ConstantInt::get(II.getType(), Result));
2247
}
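// Worked example (illustrative values): pext(src == 0b1100, mask == 0b1010)
// tests mask bits 1 and 3 against the source; only bit 3 is set in the
// source, so the loop above produces 0b10 == 2.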
2248
}
2249
break;
2250
case Intrinsic::x86_bmi_pdep_32:
2251
case Intrinsic::x86_bmi_pdep_64:
2252
if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2253
if (MaskC->isNullValue()) {
2254
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2255
}
2256
if (MaskC->isAllOnesValue()) {
2257
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2258
}
2259
2260
unsigned MaskIdx, MaskLen;
2261
if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2262
// Any single contiguous sequence of 1s anywhere in the mask simply
// describes a subset of the input bits shifted to the appropriate
// position. Replace with the straightforward IR.
2265
Value *Input = II.getArgOperand(0);
2266
Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2267
Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2268
Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2269
return IC.replaceInstUsesWith(II, Masked);
2270
}
2271
2272
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2273
uint64_t Src = SrcC->getZExtValue();
2274
uint64_t Mask = MaskC->getZExtValue();
2275
uint64_t Result = 0;
2276
uint64_t BitToTest = 1;
2277
2278
while (Mask) {
2279
// Isolate lowest set bit.
2280
uint64_t BitToSet = Mask & -Mask;
2281
if (BitToTest & Src)
2282
Result |= BitToSet;
2283
2284
BitToTest <<= 1;
2285
// Clear lowest set bit.
2286
Mask &= Mask - 1;
2287
}
2288
2289
return IC.replaceInstUsesWith(II,
2290
ConstantInt::get(II.getType(), Result));
2291
}
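// Worked example (illustrative values): pdep(src == 0b11, mask == 0b1010)
// scatters the two low source bits to mask bits 1 and 3, so the loop above
// produces 0b1010 == 10.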
2292
}
2293
break;
2294
2295
case Intrinsic::x86_sse_cvtss2si:
2296
case Intrinsic::x86_sse_cvtss2si64:
2297
case Intrinsic::x86_sse_cvttss2si:
2298
case Intrinsic::x86_sse_cvttss2si64:
2299
case Intrinsic::x86_sse2_cvtsd2si:
2300
case Intrinsic::x86_sse2_cvtsd2si64:
2301
case Intrinsic::x86_sse2_cvttsd2si:
2302
case Intrinsic::x86_sse2_cvttsd2si64:
2303
case Intrinsic::x86_avx512_vcvtss2si32:
2304
case Intrinsic::x86_avx512_vcvtss2si64:
2305
case Intrinsic::x86_avx512_vcvtss2usi32:
2306
case Intrinsic::x86_avx512_vcvtss2usi64:
2307
case Intrinsic::x86_avx512_vcvtsd2si32:
2308
case Intrinsic::x86_avx512_vcvtsd2si64:
2309
case Intrinsic::x86_avx512_vcvtsd2usi32:
2310
case Intrinsic::x86_avx512_vcvtsd2usi64:
2311
case Intrinsic::x86_avx512_cvttss2si:
2312
case Intrinsic::x86_avx512_cvttss2si64:
2313
case Intrinsic::x86_avx512_cvttss2usi:
2314
case Intrinsic::x86_avx512_cvttss2usi64:
2315
case Intrinsic::x86_avx512_cvttsd2si:
2316
case Intrinsic::x86_avx512_cvttsd2si64:
2317
case Intrinsic::x86_avx512_cvttsd2usi:
2318
case Intrinsic::x86_avx512_cvttsd2usi64: {
2319
// These intrinsics only demand the 0th element of their input vectors. If
2320
// we can simplify the input based on that, do so now.
2321
Value *Arg = II.getArgOperand(0);
2322
unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
2323
if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2324
return IC.replaceOperand(II, 0, V);
2325
}
2326
break;
2327
}
2328
2329
case Intrinsic::x86_mmx_pmovmskb:
2330
case Intrinsic::x86_sse_movmsk_ps:
2331
case Intrinsic::x86_sse2_movmsk_pd:
2332
case Intrinsic::x86_sse2_pmovmskb_128:
2333
case Intrinsic::x86_avx_movmsk_pd_256:
2334
case Intrinsic::x86_avx_movmsk_ps_256:
2335
case Intrinsic::x86_avx2_pmovmskb:
2336
if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
2337
return IC.replaceInstUsesWith(II, V);
2338
}
2339
break;
2340
2341
case Intrinsic::x86_sse_comieq_ss:
2342
case Intrinsic::x86_sse_comige_ss:
2343
case Intrinsic::x86_sse_comigt_ss:
2344
case Intrinsic::x86_sse_comile_ss:
2345
case Intrinsic::x86_sse_comilt_ss:
2346
case Intrinsic::x86_sse_comineq_ss:
2347
case Intrinsic::x86_sse_ucomieq_ss:
2348
case Intrinsic::x86_sse_ucomige_ss:
2349
case Intrinsic::x86_sse_ucomigt_ss:
2350
case Intrinsic::x86_sse_ucomile_ss:
2351
case Intrinsic::x86_sse_ucomilt_ss:
2352
case Intrinsic::x86_sse_ucomineq_ss:
2353
case Intrinsic::x86_sse2_comieq_sd:
2354
case Intrinsic::x86_sse2_comige_sd:
2355
case Intrinsic::x86_sse2_comigt_sd:
2356
case Intrinsic::x86_sse2_comile_sd:
2357
case Intrinsic::x86_sse2_comilt_sd:
2358
case Intrinsic::x86_sse2_comineq_sd:
2359
case Intrinsic::x86_sse2_ucomieq_sd:
2360
case Intrinsic::x86_sse2_ucomige_sd:
2361
case Intrinsic::x86_sse2_ucomigt_sd:
2362
case Intrinsic::x86_sse2_ucomile_sd:
2363
case Intrinsic::x86_sse2_ucomilt_sd:
2364
case Intrinsic::x86_sse2_ucomineq_sd:
2365
case Intrinsic::x86_avx512_vcomi_ss:
2366
case Intrinsic::x86_avx512_vcomi_sd:
2367
case Intrinsic::x86_avx512_mask_cmp_ss:
2368
case Intrinsic::x86_avx512_mask_cmp_sd: {
2369
// These intrinsics only demand the 0th element of their input vectors. If
2370
// we can simplify the input based on that, do so now.
2371
bool MadeChange = false;
2372
Value *Arg0 = II.getArgOperand(0);
2373
Value *Arg1 = II.getArgOperand(1);
2374
unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
2375
if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2376
IC.replaceOperand(II, 0, V);
2377
MadeChange = true;
2378
}
2379
if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2380
IC.replaceOperand(II, 1, V);
2381
MadeChange = true;
2382
}
2383
if (MadeChange) {
2384
return &II;
2385
}
2386
break;
2387
}
2388
2389
case Intrinsic::x86_avx512_add_ps_512:
2390
case Intrinsic::x86_avx512_div_ps_512:
2391
case Intrinsic::x86_avx512_mul_ps_512:
2392
case Intrinsic::x86_avx512_sub_ps_512:
2393
case Intrinsic::x86_avx512_add_pd_512:
2394
case Intrinsic::x86_avx512_div_pd_512:
2395
case Intrinsic::x86_avx512_mul_pd_512:
2396
case Intrinsic::x86_avx512_sub_pd_512:
2397
// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2398
// IR operations.
2399
if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2400
if (R->getValue() == 4) {
2401
Value *Arg0 = II.getArgOperand(0);
2402
Value *Arg1 = II.getArgOperand(1);
2403
2404
Value *V;
2405
switch (IID) {
2406
default:
2407
llvm_unreachable("Case stmts out of sync!");
2408
case Intrinsic::x86_avx512_add_ps_512:
2409
case Intrinsic::x86_avx512_add_pd_512:
2410
V = IC.Builder.CreateFAdd(Arg0, Arg1);
2411
break;
2412
case Intrinsic::x86_avx512_sub_ps_512:
2413
case Intrinsic::x86_avx512_sub_pd_512:
2414
V = IC.Builder.CreateFSub(Arg0, Arg1);
2415
break;
2416
case Intrinsic::x86_avx512_mul_ps_512:
2417
case Intrinsic::x86_avx512_mul_pd_512:
2418
V = IC.Builder.CreateFMul(Arg0, Arg1);
2419
break;
2420
case Intrinsic::x86_avx512_div_ps_512:
2421
case Intrinsic::x86_avx512_div_pd_512:
2422
V = IC.Builder.CreateFDiv(Arg0, Arg1);
2423
break;
2424
}
2425
2426
return IC.replaceInstUsesWith(II, V);
2427
}
2428
}
2429
break;
2430
2431
case Intrinsic::x86_avx512_mask_add_ss_round:
2432
case Intrinsic::x86_avx512_mask_div_ss_round:
2433
case Intrinsic::x86_avx512_mask_mul_ss_round:
2434
case Intrinsic::x86_avx512_mask_sub_ss_round:
2435
case Intrinsic::x86_avx512_mask_add_sd_round:
2436
case Intrinsic::x86_avx512_mask_div_sd_round:
2437
case Intrinsic::x86_avx512_mask_mul_sd_round:
2438
case Intrinsic::x86_avx512_mask_sub_sd_round:
2439
// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2440
// IR operations.
2441
if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
2442
if (R->getValue() == 4) {
2443
// Extract the lowest element of each operand as a scalar.
2444
Value *Arg0 = II.getArgOperand(0);
2445
Value *Arg1 = II.getArgOperand(1);
2446
Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
2447
Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
2448
2449
Value *V;
2450
switch (IID) {
2451
default:
2452
llvm_unreachable("Case stmts out of sync!");
2453
case Intrinsic::x86_avx512_mask_add_ss_round:
2454
case Intrinsic::x86_avx512_mask_add_sd_round:
2455
V = IC.Builder.CreateFAdd(LHS, RHS);
2456
break;
2457
case Intrinsic::x86_avx512_mask_sub_ss_round:
2458
case Intrinsic::x86_avx512_mask_sub_sd_round:
2459
V = IC.Builder.CreateFSub(LHS, RHS);
2460
break;
2461
case Intrinsic::x86_avx512_mask_mul_ss_round:
2462
case Intrinsic::x86_avx512_mask_mul_sd_round:
2463
V = IC.Builder.CreateFMul(LHS, RHS);
2464
break;
2465
case Intrinsic::x86_avx512_mask_div_ss_round:
2466
case Intrinsic::x86_avx512_mask_div_sd_round:
2467
V = IC.Builder.CreateFDiv(LHS, RHS);
2468
break;
2469
}
2470
2471
// Handle the masking aspect of the intrinsic.
2472
Value *Mask = II.getArgOperand(3);
2473
auto *C = dyn_cast<ConstantInt>(Mask);
2474
// We don't need a select if we know the mask bit is a 1.
2475
if (!C || !C->getValue()[0]) {
2476
// Cast the mask to an i1 vector and then extract the lowest element.
2477
auto *MaskTy = FixedVectorType::get(
2478
IC.Builder.getInt1Ty(),
2479
cast<IntegerType>(Mask->getType())->getBitWidth());
2480
Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
2481
Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
2482
// Extract the lowest element from the passthru operand.
2483
Value *Passthru =
2484
IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
2485
V = IC.Builder.CreateSelect(Mask, V, Passthru);
2486
}
2487
2488
// Insert the result back into the original argument 0.
2489
V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2490
2491
return IC.replaceInstUsesWith(II, V);
2492
}
2493
}
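// Rough sketch of the IR built above for mask_add_ss_round(%a, %b, %pt,
// i8 %k, i32 4) when the mask bit is not known to be 1 (names illustrative):
//   %x  = extractelement <4 x float> %a, i64 0
//   %y  = extractelement <4 x float> %b, i64 0
//   %s  = fadd float %x, %y
//   %m  = bitcast i8 %k to <8 x i1>
//   %m0 = extractelement <8 x i1> %m, i64 0
//   %p  = extractelement <4 x float> %pt, i64 0
//   %r  = select i1 %m0, float %s, float %p
//   %v  = insertelement <4 x float> %a, float %r, i64 0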
2494
break;
2495
2496
// Constant fold ashr( <A x Bi>, Ci ).
2497
// Constant fold lshr( <A x Bi>, Ci ).
2498
// Constant fold shl( <A x Bi>, Ci ).
2499
case Intrinsic::x86_sse2_psrai_d:
2500
case Intrinsic::x86_sse2_psrai_w:
2501
case Intrinsic::x86_avx2_psrai_d:
2502
case Intrinsic::x86_avx2_psrai_w:
2503
case Intrinsic::x86_avx512_psrai_q_128:
2504
case Intrinsic::x86_avx512_psrai_q_256:
2505
case Intrinsic::x86_avx512_psrai_d_512:
2506
case Intrinsic::x86_avx512_psrai_q_512:
2507
case Intrinsic::x86_avx512_psrai_w_512:
2508
case Intrinsic::x86_sse2_psrli_d:
2509
case Intrinsic::x86_sse2_psrli_q:
2510
case Intrinsic::x86_sse2_psrli_w:
2511
case Intrinsic::x86_avx2_psrli_d:
2512
case Intrinsic::x86_avx2_psrli_q:
2513
case Intrinsic::x86_avx2_psrli_w:
2514
case Intrinsic::x86_avx512_psrli_d_512:
2515
case Intrinsic::x86_avx512_psrli_q_512:
2516
case Intrinsic::x86_avx512_psrli_w_512:
2517
case Intrinsic::x86_sse2_pslli_d:
2518
case Intrinsic::x86_sse2_pslli_q:
2519
case Intrinsic::x86_sse2_pslli_w:
2520
case Intrinsic::x86_avx2_pslli_d:
2521
case Intrinsic::x86_avx2_pslli_q:
2522
case Intrinsic::x86_avx2_pslli_w:
2523
case Intrinsic::x86_avx512_pslli_d_512:
2524
case Intrinsic::x86_avx512_pslli_q_512:
2525
case Intrinsic::x86_avx512_pslli_w_512:
2526
if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2527
return IC.replaceInstUsesWith(II, V);
2528
}
2529
break;
2530
2531
case Intrinsic::x86_sse2_psra_d:
2532
case Intrinsic::x86_sse2_psra_w:
2533
case Intrinsic::x86_avx2_psra_d:
2534
case Intrinsic::x86_avx2_psra_w:
2535
case Intrinsic::x86_avx512_psra_q_128:
2536
case Intrinsic::x86_avx512_psra_q_256:
2537
case Intrinsic::x86_avx512_psra_d_512:
2538
case Intrinsic::x86_avx512_psra_q_512:
2539
case Intrinsic::x86_avx512_psra_w_512:
2540
case Intrinsic::x86_sse2_psrl_d:
2541
case Intrinsic::x86_sse2_psrl_q:
2542
case Intrinsic::x86_sse2_psrl_w:
2543
case Intrinsic::x86_avx2_psrl_d:
2544
case Intrinsic::x86_avx2_psrl_q:
2545
case Intrinsic::x86_avx2_psrl_w:
2546
case Intrinsic::x86_avx512_psrl_d_512:
2547
case Intrinsic::x86_avx512_psrl_q_512:
2548
case Intrinsic::x86_avx512_psrl_w_512:
2549
case Intrinsic::x86_sse2_psll_d:
2550
case Intrinsic::x86_sse2_psll_q:
2551
case Intrinsic::x86_sse2_psll_w:
2552
case Intrinsic::x86_avx2_psll_d:
2553
case Intrinsic::x86_avx2_psll_q:
2554
case Intrinsic::x86_avx2_psll_w:
2555
case Intrinsic::x86_avx512_psll_d_512:
2556
case Intrinsic::x86_avx512_psll_q_512:
2557
case Intrinsic::x86_avx512_psll_w_512: {
2558
if (Value *V = simplifyX86immShift(II, IC.Builder)) {
2559
return IC.replaceInstUsesWith(II, V);
2560
}
2561
2562
// SSE2/AVX2 shifts use only the first 64-bits of the 128-bit vector
// operand to compute the shift amount.
2564
Value *Arg1 = II.getArgOperand(1);
2565
assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2566
"Unexpected packed shift size");
2567
unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
2568
2569
if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2570
return IC.replaceOperand(II, 1, V);
2571
}
2572
break;
2573
}
2574
2575
case Intrinsic::x86_avx2_psllv_d:
2576
case Intrinsic::x86_avx2_psllv_d_256:
2577
case Intrinsic::x86_avx2_psllv_q:
2578
case Intrinsic::x86_avx2_psllv_q_256:
2579
case Intrinsic::x86_avx512_psllv_d_512:
2580
case Intrinsic::x86_avx512_psllv_q_512:
2581
case Intrinsic::x86_avx512_psllv_w_128:
2582
case Intrinsic::x86_avx512_psllv_w_256:
2583
case Intrinsic::x86_avx512_psllv_w_512:
2584
case Intrinsic::x86_avx2_psrav_d:
2585
case Intrinsic::x86_avx2_psrav_d_256:
2586
case Intrinsic::x86_avx512_psrav_q_128:
2587
case Intrinsic::x86_avx512_psrav_q_256:
2588
case Intrinsic::x86_avx512_psrav_d_512:
2589
case Intrinsic::x86_avx512_psrav_q_512:
2590
case Intrinsic::x86_avx512_psrav_w_128:
2591
case Intrinsic::x86_avx512_psrav_w_256:
2592
case Intrinsic::x86_avx512_psrav_w_512:
2593
case Intrinsic::x86_avx2_psrlv_d:
2594
case Intrinsic::x86_avx2_psrlv_d_256:
2595
case Intrinsic::x86_avx2_psrlv_q:
2596
case Intrinsic::x86_avx2_psrlv_q_256:
2597
case Intrinsic::x86_avx512_psrlv_d_512:
2598
case Intrinsic::x86_avx512_psrlv_q_512:
2599
case Intrinsic::x86_avx512_psrlv_w_128:
2600
case Intrinsic::x86_avx512_psrlv_w_256:
2601
case Intrinsic::x86_avx512_psrlv_w_512:
2602
if (Value *V = simplifyX86varShift(II, IC.Builder)) {
2603
return IC.replaceInstUsesWith(II, V);
2604
}
2605
break;
2606
2607
case Intrinsic::x86_sse2_packssdw_128:
2608
case Intrinsic::x86_sse2_packsswb_128:
2609
case Intrinsic::x86_avx2_packssdw:
2610
case Intrinsic::x86_avx2_packsswb:
2611
case Intrinsic::x86_avx512_packssdw_512:
2612
case Intrinsic::x86_avx512_packsswb_512:
2613
if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
2614
return IC.replaceInstUsesWith(II, V);
2615
}
2616
break;
2617
2618
case Intrinsic::x86_sse2_packuswb_128:
2619
case Intrinsic::x86_sse41_packusdw:
2620
case Intrinsic::x86_avx2_packusdw:
2621
case Intrinsic::x86_avx2_packuswb:
2622
case Intrinsic::x86_avx512_packusdw_512:
2623
case Intrinsic::x86_avx512_packuswb_512:
2624
if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
2625
return IC.replaceInstUsesWith(II, V);
2626
}
2627
break;
2628
2629
case Intrinsic::x86_sse2_pmulh_w:
2630
case Intrinsic::x86_avx2_pmulh_w:
2631
case Intrinsic::x86_avx512_pmulh_w_512:
2632
if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
2633
return IC.replaceInstUsesWith(II, V);
2634
}
2635
break;
2636
2637
case Intrinsic::x86_sse2_pmulhu_w:
2638
case Intrinsic::x86_avx2_pmulhu_w:
2639
case Intrinsic::x86_avx512_pmulhu_w_512:
2640
if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
2641
return IC.replaceInstUsesWith(II, V);
2642
}
2643
break;
2644
2645
case Intrinsic::x86_ssse3_pmul_hr_sw_128:
2646
case Intrinsic::x86_avx2_pmul_hr_sw:
2647
case Intrinsic::x86_avx512_pmul_hr_sw_512:
2648
if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
2649
return IC.replaceInstUsesWith(II, V);
2650
}
2651
break;
2652
2653
case Intrinsic::x86_sse2_pmadd_wd:
2654
case Intrinsic::x86_avx2_pmadd_wd:
2655
case Intrinsic::x86_avx512_pmaddw_d_512:
2656
if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
2657
return IC.replaceInstUsesWith(II, V);
2658
}
2659
break;
2660
2661
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
2662
case Intrinsic::x86_avx2_pmadd_ub_sw:
2663
case Intrinsic::x86_avx512_pmaddubs_w_512:
2664
if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
2665
return IC.replaceInstUsesWith(II, V);
2666
}
2667
break;
2668
2669
case Intrinsic::x86_pclmulqdq:
2670
case Intrinsic::x86_pclmulqdq_256:
2671
case Intrinsic::x86_pclmulqdq_512: {
2672
if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
2673
unsigned Imm = C->getZExtValue();
2674
2675
bool MadeChange = false;
2676
Value *Arg0 = II.getArgOperand(0);
2677
Value *Arg1 = II.getArgOperand(1);
2678
unsigned VWidth =
2679
cast<FixedVectorType>(Arg0->getType())->getNumElements();
2680
2681
APInt UndefElts1(VWidth, 0);
2682
APInt DemandedElts1 =
2683
APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
2684
if (Value *V =
2685
IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
2686
IC.replaceOperand(II, 0, V);
2687
MadeChange = true;
2688
}
2689
2690
APInt UndefElts2(VWidth, 0);
2691
APInt DemandedElts2 =
2692
APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
2693
if (Value *V =
2694
IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
2695
IC.replaceOperand(II, 1, V);
2696
MadeChange = true;
2697
}
2698
2699
// If all demanded elements of either input are undef, the result is zero.
2700
if (DemandedElts1.isSubsetOf(UndefElts1) ||
2701
DemandedElts2.isSubsetOf(UndefElts2)) {
2702
return IC.replaceInstUsesWith(II,
2703
ConstantAggregateZero::get(II.getType()));
2704
}
2705
2706
if (MadeChange) {
2707
return &II;
2708
}
2709
}
2710
break;
2711
}
2712
2713
case Intrinsic::x86_sse41_insertps:
2714
if (Value *V = simplifyX86insertps(II, IC.Builder)) {
2715
return IC.replaceInstUsesWith(II, V);
2716
}
2717
break;
2718
2719
case Intrinsic::x86_sse4a_extrq: {
2720
Value *Op0 = II.getArgOperand(0);
2721
Value *Op1 = II.getArgOperand(1);
2722
unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2723
unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2724
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2725
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2726
VWidth1 == 16 && "Unexpected operand sizes");
2727
2728
// See if we're dealing with constant values.
2729
auto *C1 = dyn_cast<Constant>(Op1);
2730
auto *CILength =
2731
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
2732
: nullptr;
2733
auto *CIIndex =
2734
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2735
: nullptr;
2736
2737
// Attempt to simplify to a constant, shuffle vector or EXTRQI call.
2738
if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2739
return IC.replaceInstUsesWith(II, V);
2740
}
2741
2742
// EXTRQ only uses the lowest 64-bits of the first 128-bit vector
// operand and the lowest 16-bits of the second.
2744
bool MadeChange = false;
2745
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2746
IC.replaceOperand(II, 0, V);
2747
MadeChange = true;
2748
}
2749
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
2750
IC.replaceOperand(II, 1, V);
2751
MadeChange = true;
2752
}
2753
if (MadeChange) {
2754
return &II;
2755
}
2756
break;
2757
}
2758
2759
case Intrinsic::x86_sse4a_extrqi: {
2760
// EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
2761
// bits of the lower 64-bits. The upper 64-bits are undefined.
2762
Value *Op0 = II.getArgOperand(0);
2763
unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2764
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2765
"Unexpected operand size");
2766
2767
// See if we're dealing with constant values.
2768
auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
2769
auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
2770
2771
// Attempt to simplify to a constant or shuffle vector.
2772
if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
2773
return IC.replaceInstUsesWith(II, V);
2774
}
2775
2776
// EXTRQI only uses the lowest 64-bits of the first 128-bit vector
2777
// operand.
2778
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2779
return IC.replaceOperand(II, 0, V);
2780
}
2781
break;
2782
}
2783
2784
case Intrinsic::x86_sse4a_insertq: {
2785
Value *Op0 = II.getArgOperand(0);
2786
Value *Op1 = II.getArgOperand(1);
2787
unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
2788
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2789
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
2790
cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
2791
"Unexpected operand size");
2792
2793
// See if we're dealing with constant values.
2794
auto *C1 = dyn_cast<Constant>(Op1);
2795
auto *CI11 =
2796
C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
2797
: nullptr;
2798
2799
// Attempt to simplify to a constant, shuffle vector or INSERTQI call.
2800
if (CI11) {
2801
const APInt &V11 = CI11->getValue();
2802
APInt Len = V11.zextOrTrunc(6);
2803
APInt Idx = V11.lshr(8).zextOrTrunc(6);
2804
if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2805
return IC.replaceInstUsesWith(II, V);
2806
}
2807
}
2808
2809
// INSERTQ only uses the lowest 64-bits of the first 128-bit vector
2810
// operand.
2811
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
2812
return IC.replaceOperand(II, 0, V);
2813
}
2814
break;
2815
}
2816
2817
case Intrinsic::x86_sse4a_insertqi: {
2818
// INSERTQI: Extract the lowest Length bits from the lower half of the second
// source and insert them over the first source starting at bit Index. The
// upper 64-bits are undefined.
2821
Value *Op0 = II.getArgOperand(0);
2822
Value *Op1 = II.getArgOperand(1);
2823
unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
2824
unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
2825
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
2826
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
2827
VWidth1 == 2 && "Unexpected operand sizes");
2828
2829
// See if we're dealing with constant values.
2830
auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
2831
auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
2832
2833
// Attempt to simplify to a constant or shuffle vector.
2834
if (CILength && CIIndex) {
2835
APInt Len = CILength->getValue().zextOrTrunc(6);
2836
APInt Idx = CIIndex->getValue().zextOrTrunc(6);
2837
if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
2838
return IC.replaceInstUsesWith(II, V);
2839
}
2840
}
2841
2842
// INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
2843
// operands.
2844
bool MadeChange = false;
2845
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2846
IC.replaceOperand(II, 0, V);
2847
MadeChange = true;
2848
}
2849
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2850
IC.replaceOperand(II, 1, V);
2851
MadeChange = true;
2852
}
2853
if (MadeChange) {
2854
return &II;
2855
}
2856
break;
2857
}
2858
2859
case Intrinsic::x86_sse41_pblendvb:
2860
case Intrinsic::x86_sse41_blendvps:
2861
case Intrinsic::x86_sse41_blendvpd:
2862
case Intrinsic::x86_avx_blendv_ps_256:
2863
case Intrinsic::x86_avx_blendv_pd_256:
2864
case Intrinsic::x86_avx2_pblendvb: {
2865
// fold (blend A, A, Mask) -> A
2866
Value *Op0 = II.getArgOperand(0);
2867
Value *Op1 = II.getArgOperand(1);
2868
Value *Mask = II.getArgOperand(2);
2869
if (Op0 == Op1) {
2870
return IC.replaceInstUsesWith(II, Op0);
2871
}
2872
2873
// Zero Mask - select 1st argument.
2874
if (isa<ConstantAggregateZero>(Mask)) {
2875
return IC.replaceInstUsesWith(II, Op0);
2876
}
2877
2878
// Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2879
if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2880
Constant *NewSelector =
2881
getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2882
return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2883
}
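// Worked example (illustrative values, blendvps on <4 x float>): a constant
// mask whose elements have sign bits <set, clear, set, clear> gives
// NewSelector == <i1 true, i1 false, i1 true, i1 false>, so the intrinsic
// becomes a select taking lanes 0 and 2 from Op1 and lanes 1 and 3 from Op0.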
2884
2885
Mask = InstCombiner::peekThroughBitcast(Mask);
2886
2887
// Peek through a one-use shuffle - VectorCombine should have simplified
2888
// this for cases where we're splitting wider vectors to use blendv
2889
// intrinsics.
2890
Value *MaskSrc = nullptr;
2891
ArrayRef<int> ShuffleMask;
2892
if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
2893
m_Mask(ShuffleMask))))) {
2894
// Bail if the shuffle was irregular or contains undefs.
2895
int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2896
if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2897
any_of(ShuffleMask,
2898
[NumElts](int M) { return M < 0 || M >= NumElts; }))
2899
break;
2900
Mask = InstCombiner::peekThroughBitcast(MaskSrc);
2901
}
2902
2903
// Convert to a vector select if we can bypass casts and find a boolean
2904
// vector condition value.
2905
Value *BoolVec;
2906
if (match(Mask, m_SExt(m_Value(BoolVec))) &&
2907
BoolVec->getType()->isVectorTy() &&
2908
BoolVec->getType()->getScalarSizeInBits() == 1) {
2909
auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2910
auto *OpTy = cast<FixedVectorType>(II.getType());
2911
unsigned NumMaskElts = MaskTy->getNumElements();
2912
unsigned NumOperandElts = OpTy->getNumElements();
2913
2914
// If we peeked through a shuffle, reapply the shuffle to the bool vector.
2915
if (MaskSrc) {
2916
unsigned NumMaskSrcElts =
2917
cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2918
NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2919
// Multiple mask bits map to the same operand element - bail out.
2920
if (NumMaskElts > NumOperandElts)
2921
break;
2922
SmallVector<int> ScaledMask;
2923
if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2924
break;
2925
BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2926
MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2927
}
2928
assert(MaskTy->getPrimitiveSizeInBits() ==
2929
OpTy->getPrimitiveSizeInBits() &&
2930
"Not expecting mask and operands with different sizes");
2931
2932
if (NumMaskElts == NumOperandElts) {
2933
return SelectInst::Create(BoolVec, Op1, Op0);
2934
}
2935
2936
// If the mask has fewer elements than the operands, each mask bit maps to
// multiple elements of the operands. Bitcast back and forth.
2938
if (NumMaskElts < NumOperandElts) {
2939
Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2940
Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2941
Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2942
return new BitCastInst(Sel, II.getType());
2943
}
2944
}
2945
2946
break;
2947
}
2948
2949
case Intrinsic::x86_ssse3_pshuf_b_128:
2950
case Intrinsic::x86_avx2_pshuf_b:
2951
case Intrinsic::x86_avx512_pshuf_b_512:
2952
if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2953
return IC.replaceInstUsesWith(II, V);
2954
}
2955
break;
2956
2957
case Intrinsic::x86_avx_vpermilvar_ps:
2958
case Intrinsic::x86_avx_vpermilvar_ps_256:
2959
case Intrinsic::x86_avx512_vpermilvar_ps_512:
2960
case Intrinsic::x86_avx_vpermilvar_pd:
2961
case Intrinsic::x86_avx_vpermilvar_pd_256:
2962
case Intrinsic::x86_avx512_vpermilvar_pd_512:
2963
if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2964
return IC.replaceInstUsesWith(II, V);
2965
}
2966
break;
2967
2968
case Intrinsic::x86_avx2_permd:
2969
case Intrinsic::x86_avx2_permps:
2970
case Intrinsic::x86_avx512_permvar_df_256:
2971
case Intrinsic::x86_avx512_permvar_df_512:
2972
case Intrinsic::x86_avx512_permvar_di_256:
2973
case Intrinsic::x86_avx512_permvar_di_512:
2974
case Intrinsic::x86_avx512_permvar_hi_128:
2975
case Intrinsic::x86_avx512_permvar_hi_256:
2976
case Intrinsic::x86_avx512_permvar_hi_512:
2977
case Intrinsic::x86_avx512_permvar_qi_128:
2978
case Intrinsic::x86_avx512_permvar_qi_256:
2979
case Intrinsic::x86_avx512_permvar_qi_512:
2980
case Intrinsic::x86_avx512_permvar_sf_512:
2981
case Intrinsic::x86_avx512_permvar_si_512:
2982
if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2983
return IC.replaceInstUsesWith(II, V);
2984
}
2985
break;
2986
2987
case Intrinsic::x86_avx512_vpermi2var_d_128:
2988
case Intrinsic::x86_avx512_vpermi2var_d_256:
2989
case Intrinsic::x86_avx512_vpermi2var_d_512:
2990
case Intrinsic::x86_avx512_vpermi2var_hi_128:
2991
case Intrinsic::x86_avx512_vpermi2var_hi_256:
2992
case Intrinsic::x86_avx512_vpermi2var_hi_512:
2993
case Intrinsic::x86_avx512_vpermi2var_pd_128:
2994
case Intrinsic::x86_avx512_vpermi2var_pd_256:
2995
case Intrinsic::x86_avx512_vpermi2var_pd_512:
2996
case Intrinsic::x86_avx512_vpermi2var_ps_128:
2997
case Intrinsic::x86_avx512_vpermi2var_ps_256:
2998
case Intrinsic::x86_avx512_vpermi2var_ps_512:
2999
case Intrinsic::x86_avx512_vpermi2var_q_128:
3000
case Intrinsic::x86_avx512_vpermi2var_q_256:
3001
case Intrinsic::x86_avx512_vpermi2var_q_512:
3002
case Intrinsic::x86_avx512_vpermi2var_qi_128:
3003
case Intrinsic::x86_avx512_vpermi2var_qi_256:
3004
case Intrinsic::x86_avx512_vpermi2var_qi_512:
3005
if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
3006
return IC.replaceInstUsesWith(II, V);
3007
}
3008
break;
3009
3010
case Intrinsic::x86_avx_maskload_ps:
3011
case Intrinsic::x86_avx_maskload_pd:
3012
case Intrinsic::x86_avx_maskload_ps_256:
3013
case Intrinsic::x86_avx_maskload_pd_256:
3014
case Intrinsic::x86_avx2_maskload_d:
3015
case Intrinsic::x86_avx2_maskload_q:
3016
case Intrinsic::x86_avx2_maskload_d_256:
3017
case Intrinsic::x86_avx2_maskload_q_256:
3018
if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
3019
return I;
3020
}
3021
break;
3022
3023
case Intrinsic::x86_sse2_maskmov_dqu:
3024
case Intrinsic::x86_avx_maskstore_ps:
3025
case Intrinsic::x86_avx_maskstore_pd:
3026
case Intrinsic::x86_avx_maskstore_ps_256:
3027
case Intrinsic::x86_avx_maskstore_pd_256:
3028
case Intrinsic::x86_avx2_maskstore_d:
3029
case Intrinsic::x86_avx2_maskstore_q:
3030
case Intrinsic::x86_avx2_maskstore_d_256:
3031
case Intrinsic::x86_avx2_maskstore_q_256:
3032
if (simplifyX86MaskedStore(II, IC)) {
3033
return nullptr;
3034
}
3035
break;
3036
3037
case Intrinsic::x86_addcarry_32:
3038
case Intrinsic::x86_addcarry_64:
3039
if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
3040
return IC.replaceInstUsesWith(II, V);
3041
}
3042
break;
3043
3044
case Intrinsic::x86_avx512_pternlog_d_128:
3045
case Intrinsic::x86_avx512_pternlog_d_256:
3046
case Intrinsic::x86_avx512_pternlog_d_512:
3047
case Intrinsic::x86_avx512_pternlog_q_128:
3048
case Intrinsic::x86_avx512_pternlog_q_256:
3049
case Intrinsic::x86_avx512_pternlog_q_512:
3050
if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
3051
return IC.replaceInstUsesWith(II, V);
3052
}
3053
break;
3054
default:
3055
break;
3056
}
3057
return std::nullopt;
3058
}
3059
3060
std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
3061
InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
3062
bool &KnownBitsComputed) const {
3063
switch (II.getIntrinsicID()) {
3064
default:
3065
break;
3066
case Intrinsic::x86_mmx_pmovmskb:
3067
case Intrinsic::x86_sse_movmsk_ps:
3068
case Intrinsic::x86_sse2_movmsk_pd:
3069
case Intrinsic::x86_sse2_pmovmskb_128:
3070
case Intrinsic::x86_avx_movmsk_ps_256:
3071
case Intrinsic::x86_avx_movmsk_pd_256:
3072
case Intrinsic::x86_avx2_pmovmskb: {
3073
// MOVMSK copies the vector elements' sign bits to the low bits
3074
// and zeros the high bits.
3075
unsigned ArgWidth;
3076
if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
3077
ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
3078
} else {
3079
auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
3080
ArgWidth = ArgType->getNumElements();
3081
}
3082
3083
// If we don't need any of the low bits then return zero; we already
// know that DemandedMask is non-zero.
3085
APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3086
Type *VTy = II.getType();
3087
if (DemandedElts.isZero()) {
3088
return ConstantInt::getNullValue(VTy);
3089
}
3090
3091
// We know that the upper bits are set to zero.
3092
Known.Zero.setBitsFrom(ArgWidth);
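// For example, for pmovmskb.128 (ArgWidth == 16) bits 16 and above of the
// i32 result are known to be zero.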
3093
KnownBitsComputed = true;
3094
break;
3095
}
3096
}
3097
return std::nullopt;
3098
}
3099
3100
std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3101
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3102
APInt &UndefElts2, APInt &UndefElts3,
3103
std::function<void(Instruction *, unsigned, APInt, APInt &)>
3104
simplifyAndSetOp) const {
3105
unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3106
switch (II.getIntrinsicID()) {
3107
default:
3108
break;
3109
case Intrinsic::x86_xop_vfrcz_ss:
3110
case Intrinsic::x86_xop_vfrcz_sd:
3111
// The instructions for these intrinsics are specified to zero the upper
// bits, not pass them through like other scalar intrinsics. So we
// shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
// other intrinsics. Instead we should return a zero vector.
3115
if (!DemandedElts[0]) {
3116
IC.addToWorklist(&II);
3117
return ConstantAggregateZero::get(II.getType());
3118
}
3119
3120
// Only the lower element is used.
3121
DemandedElts = 1;
3122
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3123
3124
// Only the lower element is undefined. The high elements are zero.
3125
UndefElts = UndefElts[0];
3126
break;
3127
3128
// Unary scalar-as-vector operations that work column-wise.
3129
case Intrinsic::x86_sse_rcp_ss:
3130
case Intrinsic::x86_sse_rsqrt_ss:
3131
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3132
3133
// If lowest element of a scalar op isn't used then use Arg0.
3134
if (!DemandedElts[0]) {
3135
IC.addToWorklist(&II);
3136
return II.getArgOperand(0);
3137
}
3138
// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
3139
// checks).
3140
break;
3141
3142
// Binary scalar-as-vector operations that work column-wise. The high
3143
// elements come from operand 0. The low element is a function of both
3144
// operands.
3145
case Intrinsic::x86_sse_min_ss:
3146
case Intrinsic::x86_sse_max_ss:
3147
case Intrinsic::x86_sse_cmp_ss:
3148
case Intrinsic::x86_sse2_min_sd:
3149
case Intrinsic::x86_sse2_max_sd:
3150
case Intrinsic::x86_sse2_cmp_sd: {
3151
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3152
3153
// If lowest element of a scalar op isn't used then use Arg0.
3154
if (!DemandedElts[0]) {
3155
IC.addToWorklist(&II);
3156
return II.getArgOperand(0);
3157
}
3158
3159
// Only lower element is used for operand 1.
3160
DemandedElts = 1;
3161
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3162
3163
// Lower element is undefined if both lower elements are undefined.
3164
// Consider things like undef&0. The result is known zero, not undef.
3165
if (!UndefElts2[0])
3166
UndefElts.clearBit(0);
3167
3168
break;
3169
}
3170
3171
// Binary scalar-as-vector operations that work column-wise. The high
3172
// elements come from operand 0 and the low element comes from operand 1.
3173
case Intrinsic::x86_sse41_round_ss:
3174
case Intrinsic::x86_sse41_round_sd: {
3175
// Don't use the low element of operand 0.
3176
APInt DemandedElts2 = DemandedElts;
3177
DemandedElts2.clearBit(0);
3178
simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3179
3180
// If lowest element of a scalar op isn't used then use Arg0.
3181
if (!DemandedElts[0]) {
3182
IC.addToWorklist(&II);
3183
return II.getArgOperand(0);
3184
}
3185
3186
// Only lower element is used for operand 1.
3187
DemandedElts = 1;
3188
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3189
3190
// Take the high undef elements from operand 0 and take the lower element
3191
// from operand 1.
3192
UndefElts.clearBit(0);
3193
UndefElts |= UndefElts2[0];
3194
break;
3195
}
3196
3197
// Three input scalar-as-vector operations that work column-wise. The high
3198
// elements come from operand 0 and the low element is a function of all
3199
// three inputs.
3200
case Intrinsic::x86_avx512_mask_add_ss_round:
3201
case Intrinsic::x86_avx512_mask_div_ss_round:
3202
case Intrinsic::x86_avx512_mask_mul_ss_round:
3203
case Intrinsic::x86_avx512_mask_sub_ss_round:
3204
case Intrinsic::x86_avx512_mask_max_ss_round:
3205
case Intrinsic::x86_avx512_mask_min_ss_round:
3206
case Intrinsic::x86_avx512_mask_add_sd_round:
3207
case Intrinsic::x86_avx512_mask_div_sd_round:
3208
case Intrinsic::x86_avx512_mask_mul_sd_round:
3209
case Intrinsic::x86_avx512_mask_sub_sd_round:
3210
case Intrinsic::x86_avx512_mask_max_sd_round:
3211
case Intrinsic::x86_avx512_mask_min_sd_round:
3212
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3213
3214
// If lowest element of a scalar op isn't used then use Arg0.
3215
if (!DemandedElts[0]) {
3216
IC.addToWorklist(&II);
3217
return II.getArgOperand(0);
3218
}
3219
3220
// Only lower element is used for operand 1 and 2.
3221
DemandedElts = 1;
3222
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3223
simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
3224
3225
// Lower element is undefined if all three lower elements are undefined.
3226
// Consider things like undef&0. The result is known zero, not undef.
3227
if (!UndefElts2[0] || !UndefElts3[0])
3228
UndefElts.clearBit(0);
3229
break;
3230
3231
// TODO: Add fmaddsub support?
3232
case Intrinsic::x86_sse3_addsub_pd:
3233
case Intrinsic::x86_sse3_addsub_ps:
3234
case Intrinsic::x86_avx_addsub_pd_256:
3235
case Intrinsic::x86_avx_addsub_ps_256: {
3236
// If none of the even or none of the odd lanes are required, turn this
3237
// into a generic FP math instruction.
3238
APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
3239
APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
3240
bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
3241
bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
3242
if (IsSubOnly || IsAddOnly) {
3243
assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
3244
IRBuilderBase::InsertPointGuard Guard(IC.Builder);
3245
IC.Builder.SetInsertPoint(&II);
3246
Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
3247
return IC.Builder.CreateBinOp(
3248
IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
3249
}
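// For example, if only the even lanes of addsub_ps are demanded, the whole
// intrinsic can be replaced by a single fsub, since the even lanes perform
// the subtraction.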
3250
3251
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3252
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3253
UndefElts &= UndefElts2;
3254
break;
3255
}
3256
3257
// General per-element vector operations.
3258
case Intrinsic::x86_avx2_psllv_d:
3259
case Intrinsic::x86_avx2_psllv_d_256:
3260
case Intrinsic::x86_avx2_psllv_q:
3261
case Intrinsic::x86_avx2_psllv_q_256:
3262
case Intrinsic::x86_avx2_psrlv_d:
3263
case Intrinsic::x86_avx2_psrlv_d_256:
3264
case Intrinsic::x86_avx2_psrlv_q:
3265
case Intrinsic::x86_avx2_psrlv_q_256:
3266
case Intrinsic::x86_avx2_psrav_d:
3267
case Intrinsic::x86_avx2_psrav_d_256: {
3268
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3269
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3270
UndefElts &= UndefElts2;
3271
break;
3272
}
3273
3274
case Intrinsic::x86_sse2_pmulh_w:
3275
case Intrinsic::x86_avx2_pmulh_w:
3276
case Intrinsic::x86_avx512_pmulh_w_512:
3277
case Intrinsic::x86_sse2_pmulhu_w:
3278
case Intrinsic::x86_avx2_pmulhu_w:
3279
case Intrinsic::x86_avx512_pmulhu_w_512:
3280
case Intrinsic::x86_ssse3_pmul_hr_sw_128:
3281
case Intrinsic::x86_avx2_pmul_hr_sw:
3282
case Intrinsic::x86_avx512_pmul_hr_sw_512: {
3283
simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3284
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3285
// NOTE: mulh(undef,undef) != undef.
3286
break;
3287
}
3288
3289
case Intrinsic::x86_sse2_packssdw_128:
3290
case Intrinsic::x86_sse2_packsswb_128:
3291
case Intrinsic::x86_sse2_packuswb_128:
3292
case Intrinsic::x86_sse41_packusdw:
3293
case Intrinsic::x86_avx2_packssdw:
3294
case Intrinsic::x86_avx2_packsswb:
3295
case Intrinsic::x86_avx2_packusdw:
3296
case Intrinsic::x86_avx2_packuswb:
3297
case Intrinsic::x86_avx512_packssdw_512:
3298
case Intrinsic::x86_avx512_packsswb_512:
3299
case Intrinsic::x86_avx512_packusdw_512:
3300
case Intrinsic::x86_avx512_packuswb_512: {
3301
auto *Ty0 = II.getArgOperand(0)->getType();
3302
unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
3303
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
3304
3305
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
3306
unsigned VWidthPerLane = VWidth / NumLanes;
3307
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
3308
3309
// Per lane, pack the elements of the first input and then the second.
3310
// e.g.
3311
// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
3312
// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
3313
for (int OpNum = 0; OpNum != 2; ++OpNum) {
3314
APInt OpDemandedElts(InnerVWidth, 0);
3315
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3316
unsigned LaneIdx = Lane * VWidthPerLane;
3317
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
3318
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
3319
if (DemandedElts[Idx])
3320
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
3321
}
3322
}
3323
3324
// Demand elements from the operand.
3325
APInt OpUndefElts(InnerVWidth, 0);
3326
simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
3327
3328
// Pack the operand's UNDEF elements, one lane at a time.
3329
OpUndefElts = OpUndefElts.zext(VWidth);
3330
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3331
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
3332
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
3333
LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
3334
UndefElts |= LaneElts;
3335
}
3336
}
3337
break;
3338
}
3339
3340
case Intrinsic::x86_sse2_pmadd_wd:
3341
case Intrinsic::x86_avx2_pmadd_wd:
3342
case Intrinsic::x86_avx512_pmaddw_d_512:
3343
case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
3344
case Intrinsic::x86_avx2_pmadd_ub_sw:
3345
case Intrinsic::x86_avx512_pmaddubs_w_512: {
3346
// PMADD - demand both src elements that map to each dst element.
3347
auto *ArgTy = II.getArgOperand(0)->getType();
3348
unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
3349
assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
3350
APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
3351
APInt Op0UndefElts(InnerVWidth, 0);
3352
APInt Op1UndefElts(InnerVWidth, 0);
3353
simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
3354
simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
3355
// NOTE: madd(undef,undef) != undef.
3356
break;
3357
}
3358
3359
// PSHUFB
3360
case Intrinsic::x86_ssse3_pshuf_b_128:
3361
case Intrinsic::x86_avx2_pshuf_b:
3362
case Intrinsic::x86_avx512_pshuf_b_512:
3363
// PERMILVAR
3364
case Intrinsic::x86_avx_vpermilvar_ps:
3365
case Intrinsic::x86_avx_vpermilvar_ps_256:
3366
case Intrinsic::x86_avx512_vpermilvar_ps_512:
3367
case Intrinsic::x86_avx_vpermilvar_pd:
3368
case Intrinsic::x86_avx_vpermilvar_pd_256:
3369
case Intrinsic::x86_avx512_vpermilvar_pd_512:
3370
// PERMV
3371
case Intrinsic::x86_avx2_permd:
3372
case Intrinsic::x86_avx2_permps: {
3373
simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
3374
break;
3375
}
3376
3377
// SSE4A instructions leave the upper 64-bits of the 128-bit result
3378
// in an undefined state.
3379
case Intrinsic::x86_sse4a_extrq:
3380
case Intrinsic::x86_sse4a_extrqi:
3381
case Intrinsic::x86_sse4a_insertq:
3382
case Intrinsic::x86_sse4a_insertqi:
3383
UndefElts.setHighBits(VWidth / 2);
3384
break;
3385
}
3386
return std::nullopt;
3387
}
3388
3389