Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
35266 views
//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7/// \file8/// This file implements a TargetTransformInfo analysis pass specific to the9/// X86 target machine. It uses the target's detailed information to provide10/// more precise answers to certain TTI queries, while letting the target11/// independent and default TTI implementations handle the rest.12///13//===----------------------------------------------------------------------===//1415#include "X86TargetTransformInfo.h"16#include "llvm/IR/IntrinsicInst.h"17#include "llvm/IR/IntrinsicsX86.h"18#include "llvm/Support/KnownBits.h"19#include "llvm/Transforms/InstCombine/InstCombiner.h"20#include <optional>2122using namespace llvm;23using namespace llvm::PatternMatch;2425#define DEBUG_TYPE "x86tti"2627/// Return a constant boolean vector that has true elements in all positions28/// where the input constant data vector has an element with the sign bit set.29static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {30VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));31V = ConstantExpr::getBitCast(V, IntTy);32V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,33Constant::getNullValue(IntTy), V, DL);34assert(V && "Vector must be foldable");35return V;36}3738/// Convert the x86 XMM integer vector mask to a vector of bools based on39/// each element's most significant bit (the sign bit).40static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {41// Fold Constant Mask.42if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))43return getNegativeIsTrueBoolVec(ConstantMask, DL);4445// Mask was extended from a boolean vector.46Value *ExtMask;47if (match(Mask, m_SExt(m_Value(ExtMask))) &&48ExtMask->getType()->isIntOrIntVectorTy(1))49return ExtMask;5051return nullptr;52}5354// TODO: If the x86 backend knew how to convert a bool vector mask back to an55// XMM register mask efficiently, we could transform all x86 masked intrinsics56// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.57static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {58Value *Ptr = II.getOperand(0);59Value *Mask = II.getOperand(1);60Constant *ZeroVec = Constant::getNullValue(II.getType());6162// Zero Mask - masked load instruction creates a zero vector.63if (isa<ConstantAggregateZero>(Mask))64return IC.replaceInstUsesWith(II, ZeroVec);6566// The mask is constant or extended from a bool vector. 
Convert this x8667// intrinsic to the LLVM intrinsic to allow target-independent optimizations.68if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {69// First, cast the x86 intrinsic scalar pointer to a vector pointer to match70// the LLVM intrinsic definition for the pointer argument.71unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();72PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);73Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");7475// The pass-through vector for an x86 masked load is a zero vector.76CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(77II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);78return IC.replaceInstUsesWith(II, NewMaskedLoad);79}8081return nullptr;82}8384// TODO: If the x86 backend knew how to convert a bool vector mask back to an85// XMM register mask efficiently, we could transform all x86 masked intrinsics86// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.87static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {88Value *Ptr = II.getOperand(0);89Value *Mask = II.getOperand(1);90Value *Vec = II.getOperand(2);9192// Zero Mask - this masked store instruction does nothing.93if (isa<ConstantAggregateZero>(Mask)) {94IC.eraseInstFromFunction(II);95return true;96}9798// The SSE2 version is too weird (eg, unaligned but non-temporal) to do99// anything else at this level.100if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)101return false;102103// The mask is constant or extended from a bool vector. Convert this x86104// intrinsic to the LLVM intrinsic to allow target-independent optimizations.105if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {106unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();107PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);108Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");109110IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);111112// 'Replace uses' doesn't work for stores. 
Erase the original masked store.113IC.eraseInstFromFunction(II);114return true;115}116117return false;118}119120static Value *simplifyX86immShift(const IntrinsicInst &II,121InstCombiner::BuilderTy &Builder) {122bool LogicalShift = false;123bool ShiftLeft = false;124bool IsImm = false;125126switch (II.getIntrinsicID()) {127default:128llvm_unreachable("Unexpected intrinsic!");129case Intrinsic::x86_sse2_psrai_d:130case Intrinsic::x86_sse2_psrai_w:131case Intrinsic::x86_avx2_psrai_d:132case Intrinsic::x86_avx2_psrai_w:133case Intrinsic::x86_avx512_psrai_q_128:134case Intrinsic::x86_avx512_psrai_q_256:135case Intrinsic::x86_avx512_psrai_d_512:136case Intrinsic::x86_avx512_psrai_q_512:137case Intrinsic::x86_avx512_psrai_w_512:138IsImm = true;139[[fallthrough]];140case Intrinsic::x86_sse2_psra_d:141case Intrinsic::x86_sse2_psra_w:142case Intrinsic::x86_avx2_psra_d:143case Intrinsic::x86_avx2_psra_w:144case Intrinsic::x86_avx512_psra_q_128:145case Intrinsic::x86_avx512_psra_q_256:146case Intrinsic::x86_avx512_psra_d_512:147case Intrinsic::x86_avx512_psra_q_512:148case Intrinsic::x86_avx512_psra_w_512:149LogicalShift = false;150ShiftLeft = false;151break;152case Intrinsic::x86_sse2_psrli_d:153case Intrinsic::x86_sse2_psrli_q:154case Intrinsic::x86_sse2_psrli_w:155case Intrinsic::x86_avx2_psrli_d:156case Intrinsic::x86_avx2_psrli_q:157case Intrinsic::x86_avx2_psrli_w:158case Intrinsic::x86_avx512_psrli_d_512:159case Intrinsic::x86_avx512_psrli_q_512:160case Intrinsic::x86_avx512_psrli_w_512:161IsImm = true;162[[fallthrough]];163case Intrinsic::x86_sse2_psrl_d:164case Intrinsic::x86_sse2_psrl_q:165case Intrinsic::x86_sse2_psrl_w:166case Intrinsic::x86_avx2_psrl_d:167case Intrinsic::x86_avx2_psrl_q:168case Intrinsic::x86_avx2_psrl_w:169case Intrinsic::x86_avx512_psrl_d_512:170case Intrinsic::x86_avx512_psrl_q_512:171case Intrinsic::x86_avx512_psrl_w_512:172LogicalShift = true;173ShiftLeft = false;174break;175case Intrinsic::x86_sse2_pslli_d:176case Intrinsic::x86_sse2_pslli_q:177case Intrinsic::x86_sse2_pslli_w:178case Intrinsic::x86_avx2_pslli_d:179case Intrinsic::x86_avx2_pslli_q:180case Intrinsic::x86_avx2_pslli_w:181case Intrinsic::x86_avx512_pslli_d_512:182case Intrinsic::x86_avx512_pslli_q_512:183case Intrinsic::x86_avx512_pslli_w_512:184IsImm = true;185[[fallthrough]];186case Intrinsic::x86_sse2_psll_d:187case Intrinsic::x86_sse2_psll_q:188case Intrinsic::x86_sse2_psll_w:189case Intrinsic::x86_avx2_psll_d:190case Intrinsic::x86_avx2_psll_q:191case Intrinsic::x86_avx2_psll_w:192case Intrinsic::x86_avx512_psll_d_512:193case Intrinsic::x86_avx512_psll_q_512:194case Intrinsic::x86_avx512_psll_w_512:195LogicalShift = true;196ShiftLeft = true;197break;198}199assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");200201Value *Vec = II.getArgOperand(0);202Value *Amt = II.getArgOperand(1);203auto *VT = cast<FixedVectorType>(Vec->getType());204Type *SVT = VT->getElementType();205Type *AmtVT = Amt->getType();206unsigned VWidth = VT->getNumElements();207unsigned BitWidth = SVT->getPrimitiveSizeInBits();208209// If the shift amount is guaranteed to be in-range we can replace it with a210// generic shift. 
If its guaranteed to be out of range, logical shifts combine211// to zero and arithmetic shifts are clamped to (BitWidth - 1).212if (IsImm) {213assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");214KnownBits KnownAmtBits =215llvm::computeKnownBits(Amt, II.getDataLayout());216if (KnownAmtBits.getMaxValue().ult(BitWidth)) {217Amt = Builder.CreateZExtOrTrunc(Amt, SVT);218Amt = Builder.CreateVectorSplat(VWidth, Amt);219return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)220: Builder.CreateLShr(Vec, Amt))221: Builder.CreateAShr(Vec, Amt));222}223if (KnownAmtBits.getMinValue().uge(BitWidth)) {224if (LogicalShift)225return ConstantAggregateZero::get(VT);226Amt = ConstantInt::get(SVT, BitWidth - 1);227return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));228}229} else {230// Ensure the first element has an in-range value and the rest of the231// elements in the bottom 64 bits are zero.232assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&233cast<VectorType>(AmtVT)->getElementType() == SVT &&234"Unexpected shift-by-scalar type");235unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();236APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);237APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);238KnownBits KnownLowerBits = llvm::computeKnownBits(239Amt, DemandedLower, II.getDataLayout());240KnownBits KnownUpperBits = llvm::computeKnownBits(241Amt, DemandedUpper, II.getDataLayout());242if (KnownLowerBits.getMaxValue().ult(BitWidth) &&243(DemandedUpper.isZero() || KnownUpperBits.isZero())) {244SmallVector<int, 16> ZeroSplat(VWidth, 0);245Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);246return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)247: Builder.CreateLShr(Vec, Amt))248: Builder.CreateAShr(Vec, Amt));249}250}251252// Simplify if count is constant vector.253auto *CDV = dyn_cast<ConstantDataVector>(Amt);254if (!CDV)255return nullptr;256257// SSE2/AVX2 uses all the first 64-bits of the 128-bit vector258// operand to compute the shift amount.259assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&260cast<VectorType>(AmtVT)->getElementType() == SVT &&261"Unexpected shift-by-scalar type");262263// Concatenate the sub-elements to create the 64-bit value.264APInt Count(64, 0);265for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {266unsigned SubEltIdx = (NumSubElts - 1) - i;267auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));268Count <<= BitWidth;269Count |= SubElt->getValue().zextOrTrunc(64);270}271272// If shift-by-zero then just return the original value.273if (Count.isZero())274return Vec;275276// Handle cases when Shift >= BitWidth.277if (Count.uge(BitWidth)) {278// If LogicalShift - just return zero.279if (LogicalShift)280return ConstantAggregateZero::get(VT);281282// If ArithmeticShift - clamp Shift to (BitWidth - 1).283Count = APInt(64, BitWidth - 1);284}285286// Get a constant vector of the same type as the first operand.287auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));288auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);289290if (ShiftLeft)291return Builder.CreateShl(Vec, ShiftVec);292293if (LogicalShift)294return Builder.CreateLShr(Vec, ShiftVec);295296return Builder.CreateAShr(Vec, ShiftVec);297}298299// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.300// Unlike the generic IR shifts, the intrinsics have defined behaviour for out301// of range shift 
amounts (logical - set to zero, arithmetic - splat sign bit).302static Value *simplifyX86varShift(const IntrinsicInst &II,303InstCombiner::BuilderTy &Builder) {304bool LogicalShift = false;305bool ShiftLeft = false;306307switch (II.getIntrinsicID()) {308default:309llvm_unreachable("Unexpected intrinsic!");310case Intrinsic::x86_avx2_psrav_d:311case Intrinsic::x86_avx2_psrav_d_256:312case Intrinsic::x86_avx512_psrav_q_128:313case Intrinsic::x86_avx512_psrav_q_256:314case Intrinsic::x86_avx512_psrav_d_512:315case Intrinsic::x86_avx512_psrav_q_512:316case Intrinsic::x86_avx512_psrav_w_128:317case Intrinsic::x86_avx512_psrav_w_256:318case Intrinsic::x86_avx512_psrav_w_512:319LogicalShift = false;320ShiftLeft = false;321break;322case Intrinsic::x86_avx2_psrlv_d:323case Intrinsic::x86_avx2_psrlv_d_256:324case Intrinsic::x86_avx2_psrlv_q:325case Intrinsic::x86_avx2_psrlv_q_256:326case Intrinsic::x86_avx512_psrlv_d_512:327case Intrinsic::x86_avx512_psrlv_q_512:328case Intrinsic::x86_avx512_psrlv_w_128:329case Intrinsic::x86_avx512_psrlv_w_256:330case Intrinsic::x86_avx512_psrlv_w_512:331LogicalShift = true;332ShiftLeft = false;333break;334case Intrinsic::x86_avx2_psllv_d:335case Intrinsic::x86_avx2_psllv_d_256:336case Intrinsic::x86_avx2_psllv_q:337case Intrinsic::x86_avx2_psllv_q_256:338case Intrinsic::x86_avx512_psllv_d_512:339case Intrinsic::x86_avx512_psllv_q_512:340case Intrinsic::x86_avx512_psllv_w_128:341case Intrinsic::x86_avx512_psllv_w_256:342case Intrinsic::x86_avx512_psllv_w_512:343LogicalShift = true;344ShiftLeft = true;345break;346}347assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");348349Value *Vec = II.getArgOperand(0);350Value *Amt = II.getArgOperand(1);351auto *VT = cast<FixedVectorType>(II.getType());352Type *SVT = VT->getElementType();353int NumElts = VT->getNumElements();354int BitWidth = SVT->getIntegerBitWidth();355356// If the shift amount is guaranteed to be in-range we can replace it with a357// generic shift.358KnownBits KnownAmt =359llvm::computeKnownBits(Amt, II.getDataLayout());360if (KnownAmt.getMaxValue().ult(BitWidth)) {361return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)362: Builder.CreateLShr(Vec, Amt))363: Builder.CreateAShr(Vec, Amt));364}365366// Simplify if all shift amounts are constant/undef.367auto *CShift = dyn_cast<Constant>(Amt);368if (!CShift)369return nullptr;370371// Collect each element's shift amount.372// We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.373bool AnyOutOfRange = false;374SmallVector<int, 8> ShiftAmts;375for (int I = 0; I < NumElts; ++I) {376auto *CElt = CShift->getAggregateElement(I);377if (isa_and_nonnull<UndefValue>(CElt)) {378ShiftAmts.push_back(-1);379continue;380}381382auto *COp = dyn_cast_or_null<ConstantInt>(CElt);383if (!COp)384return nullptr;385386// Handle out of range shifts.387// If LogicalShift - set to BitWidth (special case).388// If ArithmeticShift - set to (BitWidth - 1) (sign splat).389APInt ShiftVal = COp->getValue();390if (ShiftVal.uge(BitWidth)) {391AnyOutOfRange = LogicalShift;392ShiftAmts.push_back(LogicalShift ? 
BitWidth : BitWidth - 1);393continue;394}395396ShiftAmts.push_back((int)ShiftVal.getZExtValue());397}398399// If all elements out of range or UNDEF, return vector of zeros/undefs.400// ArithmeticShift should only hit this if they are all UNDEF.401auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };402if (llvm::all_of(ShiftAmts, OutOfRange)) {403SmallVector<Constant *, 8> ConstantVec;404for (int Idx : ShiftAmts) {405if (Idx < 0) {406ConstantVec.push_back(UndefValue::get(SVT));407} else {408assert(LogicalShift && "Logical shift expected");409ConstantVec.push_back(ConstantInt::getNullValue(SVT));410}411}412return ConstantVector::get(ConstantVec);413}414415// We can't handle only some out of range values with generic logical shifts.416if (AnyOutOfRange)417return nullptr;418419// Build the shift amount constant vector.420SmallVector<Constant *, 8> ShiftVecAmts;421for (int Idx : ShiftAmts) {422if (Idx < 0)423ShiftVecAmts.push_back(UndefValue::get(SVT));424else425ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));426}427auto ShiftVec = ConstantVector::get(ShiftVecAmts);428429if (ShiftLeft)430return Builder.CreateShl(Vec, ShiftVec);431432if (LogicalShift)433return Builder.CreateLShr(Vec, ShiftVec);434435return Builder.CreateAShr(Vec, ShiftVec);436}437438static Value *simplifyX86pack(IntrinsicInst &II,439InstCombiner::BuilderTy &Builder, bool IsSigned) {440Value *Arg0 = II.getArgOperand(0);441Value *Arg1 = II.getArgOperand(1);442Type *ResTy = II.getType();443444// Fast all undef handling.445if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))446return UndefValue::get(ResTy);447448auto *ArgTy = cast<FixedVectorType>(Arg0->getType());449unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;450unsigned NumSrcElts = ArgTy->getNumElements();451assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&452"Unexpected packing types");453454unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;455unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();456unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();457assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&458"Unexpected packing types");459460// Constant folding.461if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))462return nullptr;463464// Clamp Values - signed/unsigned both use signed clamp values, but they465// differ on the min/max values.466APInt MinValue, MaxValue;467if (IsSigned) {468// PACKSS: Truncate signed value with signed saturation.469// Source values less than dst minint are saturated to minint.470// Source values greater than dst maxint are saturated to maxint.471MinValue =472APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);473MaxValue =474APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);475} else {476// PACKUS: Truncate signed value with unsigned saturation.477// Source values less than zero are saturated to zero.478// Source values greater than dst maxuint are saturated to maxuint.479MinValue = APInt::getZero(SrcScalarSizeInBits);480MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);481}482483auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);484auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);485Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);486Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);487Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);488Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, 
Arg1);489490// Shuffle clamped args together at the lane level.491SmallVector<int, 32> PackMask;492for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {493for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)494PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));495for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)496PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);497}498auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);499500// Truncate to dst size.501return Builder.CreateTrunc(Shuffle, ResTy);502}503504static Value *simplifyX86pmulh(IntrinsicInst &II,505InstCombiner::BuilderTy &Builder, bool IsSigned,506bool IsRounding) {507Value *Arg0 = II.getArgOperand(0);508Value *Arg1 = II.getArgOperand(1);509auto *ResTy = cast<FixedVectorType>(II.getType());510auto *ArgTy = cast<FixedVectorType>(Arg0->getType());511assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&512"Unexpected PMULH types");513assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");514515// Multiply by undef -> zero (NOT undef!) as other arg could still be zero.516if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))517return ConstantAggregateZero::get(ResTy);518519// Multiply by zero.520if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))521return ConstantAggregateZero::get(ResTy);522523// Multiply by one.524if (!IsRounding) {525if (match(Arg0, m_One()))526return IsSigned ? Builder.CreateAShr(Arg1, 15)527: ConstantAggregateZero::get(ResTy);528if (match(Arg1, m_One()))529return IsSigned ? Builder.CreateAShr(Arg0, 15)530: ConstantAggregateZero::get(ResTy);531}532533// Constant folding.534if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))535return nullptr;536537// Extend to twice the width and multiply.538auto Cast =539IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;540auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);541Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);542Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);543Value *Mul = Builder.CreateMul(LHS, RHS);544545if (IsRounding) {546// PMULHRSW: truncate to vXi18 of the most significant bits, add one and547// extract bits[16:1].548auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);549auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);550Mul = Builder.CreateLShr(Mul, 14);551Mul = Builder.CreateTrunc(Mul, RndTy);552Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));553Mul = Builder.CreateLShr(Mul, 1);554} else {555// PMULH/PMULHU: extract the vXi16 most significant bits.556Mul = Builder.CreateLShr(Mul, 16);557}558559return Builder.CreateTrunc(Mul, ResTy);560}561562static Value *simplifyX86pmadd(IntrinsicInst &II,563InstCombiner::BuilderTy &Builder,564bool IsPMADDWD) {565Value *Arg0 = II.getArgOperand(0);566Value *Arg1 = II.getArgOperand(1);567auto *ResTy = cast<FixedVectorType>(II.getType());568[[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());569570unsigned NumDstElts = ResTy->getNumElements();571assert(ArgTy->getNumElements() == (2 * NumDstElts) &&572ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&573"Unexpected PMADD types");574575// Multiply by undef -> zero (NOT undef!) 
as other arg could still be zero.576if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))577return ConstantAggregateZero::get(ResTy);578579// Multiply by zero.580if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))581return ConstantAggregateZero::get(ResTy);582583// Constant folding.584if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))585return nullptr;586587// Split Lo/Hi elements pairs, extend and add together.588// PMADDWD(X,Y) =589// add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))590// PMADDUBSW(X,Y) =591// sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))592SmallVector<int> LoMask, HiMask;593for (unsigned I = 0; I != NumDstElts; ++I) {594LoMask.push_back(2 * I + 0);595HiMask.push_back(2 * I + 1);596}597598auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);599auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);600auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);601auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);602603auto LHSCast =604IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;605LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);606LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);607RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);608RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);609Value *Lo = Builder.CreateMul(LHSLo, RHSLo);610Value *Hi = Builder.CreateMul(LHSHi, RHSHi);611return IsPMADDWD612? Builder.CreateAdd(Lo, Hi)613: Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});614}615616static Value *simplifyX86movmsk(const IntrinsicInst &II,617InstCombiner::BuilderTy &Builder) {618Value *Arg = II.getArgOperand(0);619Type *ResTy = II.getType();620621// movmsk(undef) -> zero as we must ensure the upper bits are zero.622if (isa<UndefValue>(Arg))623return Constant::getNullValue(ResTy);624625auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());626// We can't easily peek through x86_mmx types.627if (!ArgTy)628return nullptr;629630// Expand MOVMSK to compare/bitcast/zext:631// e.g. 
PMOVMSKB(v16i8 x):632// %cmp = icmp slt <16 x i8> %x, zeroinitializer633// %int = bitcast <16 x i1> %cmp to i16634// %res = zext i16 %int to i32635unsigned NumElts = ArgTy->getNumElements();636Type *IntegerTy = Builder.getIntNTy(NumElts);637638Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));639Res = Builder.CreateIsNeg(Res);640Res = Builder.CreateBitCast(Res, IntegerTy);641Res = Builder.CreateZExtOrTrunc(Res, ResTy);642return Res;643}644645static Value *simplifyX86addcarry(const IntrinsicInst &II,646InstCombiner::BuilderTy &Builder) {647Value *CarryIn = II.getArgOperand(0);648Value *Op1 = II.getArgOperand(1);649Value *Op2 = II.getArgOperand(2);650Type *RetTy = II.getType();651Type *OpTy = Op1->getType();652assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&653RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&654"Unexpected types for x86 addcarry");655656// If carry-in is zero, this is just an unsigned add with overflow.657if (match(CarryIn, m_ZeroInt())) {658Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,659{Op1, Op2});660// The types have to be adjusted to match the x86 call types.661Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);662Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),663Builder.getInt8Ty());664Value *Res = PoisonValue::get(RetTy);665Res = Builder.CreateInsertValue(Res, UAddOV, 0);666return Builder.CreateInsertValue(Res, UAddResult, 1);667}668669return nullptr;670}671672static Value *simplifyTernarylogic(const IntrinsicInst &II,673InstCombiner::BuilderTy &Builder) {674675auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));676if (!ArgImm || ArgImm->getValue().uge(256))677return nullptr;678679Value *ArgA = II.getArgOperand(0);680Value *ArgB = II.getArgOperand(1);681Value *ArgC = II.getArgOperand(2);682683Type *Ty = II.getType();684685auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {686return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};687};688auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {689return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};690};691auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {692return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};693};694auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {695return {Builder.CreateNot(V.first), ~V.second};696};697auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };698auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };699auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };700701bool AIsConst = match(ArgA, m_ImmConstant());702bool BIsConst = match(ArgB, m_ImmConstant());703bool CIsConst = match(ArgC, m_ImmConstant());704705bool ABIsConst = AIsConst && BIsConst;706bool ACIsConst = AIsConst && CIsConst;707bool BCIsConst = BIsConst && CIsConst;708bool ABCIsConst = AIsConst && BIsConst && CIsConst;709710// Use for verification. Its a big table. Its difficult to go from Imm ->711// logic ops, but easy to verify that a set of logic ops is correct. We track712// the logic ops through the second value in the pair. 
At the end it should713// equal Imm.714std::pair<Value *, uint8_t> A = {ArgA, 0xf0};715std::pair<Value *, uint8_t> B = {ArgB, 0xcc};716std::pair<Value *, uint8_t> C = {ArgC, 0xaa};717std::pair<Value *, uint8_t> Res = {nullptr, 0};718719// Currently we only handle cases that convert directly to another instruction720// or cases where all the ops are constant. This is because we don't properly721// handle creating ternary ops in the backend, so splitting them here may722// cause regressions. As the backend improves, uncomment more cases.723724uint8_t Imm = ArgImm->getValue().getZExtValue();725switch (Imm) {726case 0x0:727Res = {Constant::getNullValue(Ty), 0};728break;729case 0x1:730if (ABCIsConst)731Res = Nor(Or(A, B), C);732break;733case 0x2:734if (ABCIsConst)735Res = And(Nor(A, B), C);736break;737case 0x3:738if (ABIsConst)739Res = Nor(A, B);740break;741case 0x4:742if (ABCIsConst)743Res = And(Nor(A, C), B);744break;745case 0x5:746if (ACIsConst)747Res = Nor(A, C);748break;749case 0x6:750if (ABCIsConst)751Res = Nor(A, Xnor(B, C));752break;753case 0x7:754if (ABCIsConst)755Res = Nor(A, And(B, C));756break;757case 0x8:758if (ABCIsConst)759Res = Nor(A, Nand(B, C));760break;761case 0x9:762if (ABCIsConst)763Res = Nor(A, Xor(B, C));764break;765case 0xa:766if (ACIsConst)767Res = Nor(A, Not(C));768break;769case 0xb:770if (ABCIsConst)771Res = Nor(A, Nor(C, Not(B)));772break;773case 0xc:774if (ABIsConst)775Res = Nor(A, Not(B));776break;777case 0xd:778if (ABCIsConst)779Res = Nor(A, Nor(B, Not(C)));780break;781case 0xe:782if (ABCIsConst)783Res = Nor(A, Nor(B, C));784break;785case 0xf:786Res = Not(A);787break;788case 0x10:789if (ABCIsConst)790Res = And(A, Nor(B, C));791break;792case 0x11:793if (BCIsConst)794Res = Nor(B, C);795break;796case 0x12:797if (ABCIsConst)798Res = Nor(Xnor(A, C), B);799break;800case 0x13:801if (ABCIsConst)802Res = Nor(And(A, C), B);803break;804case 0x14:805if (ABCIsConst)806Res = Nor(Xnor(A, B), C);807break;808case 0x15:809if (ABCIsConst)810Res = Nor(And(A, B), C);811break;812case 0x16:813if (ABCIsConst)814Res = Xor(Xor(A, B), And(Nand(A, B), C));815break;816case 0x17:817if (ABCIsConst)818Res = Xor(Or(A, B), Or(Xnor(A, B), C));819break;820case 0x18:821if (ABCIsConst)822Res = Nor(Xnor(A, B), Xnor(A, C));823break;824case 0x19:825if (ABCIsConst)826Res = And(Nand(A, B), Xnor(B, C));827break;828case 0x1a:829if (ABCIsConst)830Res = Xor(A, Or(And(A, B), C));831break;832case 0x1b:833if (ABCIsConst)834Res = Xor(A, Or(Xnor(A, B), C));835break;836case 0x1c:837if (ABCIsConst)838Res = Xor(A, Or(And(A, C), B));839break;840case 0x1d:841if (ABCIsConst)842Res = Xor(A, Or(Xnor(A, C), B));843break;844case 0x1e:845if (ABCIsConst)846Res = Xor(A, Or(B, C));847break;848case 0x1f:849if (ABCIsConst)850Res = Nand(A, Or(B, C));851break;852case 0x20:853if (ABCIsConst)854Res = Nor(Nand(A, C), B);855break;856case 0x21:857if (ABCIsConst)858Res = Nor(Xor(A, C), B);859break;860case 0x22:861if (BCIsConst)862Res = Nor(B, Not(C));863break;864case 0x23:865if (ABCIsConst)866Res = Nor(B, Nor(C, Not(A)));867break;868case 0x24:869if (ABCIsConst)870Res = Nor(Xnor(A, B), Xor(A, C));871break;872case 0x25:873if (ABCIsConst)874Res = Xor(A, Nand(Nand(A, B), C));875break;876case 0x26:877if (ABCIsConst)878Res = And(Nand(A, B), Xor(B, C));879break;880case 0x27:881if (ABCIsConst)882Res = Xor(Or(Xnor(A, B), C), B);883break;884case 0x28:885if (ABCIsConst)886Res = And(Xor(A, B), C);887break;888case 0x29:889if (ABCIsConst)890Res = Xor(Xor(A, B), Nor(And(A, B), C));891break;892case 0x2a:893if (ABCIsConst)894Res = And(Nand(A, B), 
C);895break;896case 0x2b:897if (ABCIsConst)898Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);899break;900case 0x2c:901if (ABCIsConst)902Res = Nor(Xnor(A, B), Nor(B, C));903break;904case 0x2d:905if (ABCIsConst)906Res = Xor(A, Or(B, Not(C)));907break;908case 0x2e:909if (ABCIsConst)910Res = Xor(A, Or(Xor(A, C), B));911break;912case 0x2f:913if (ABCIsConst)914Res = Nand(A, Or(B, Not(C)));915break;916case 0x30:917if (ABIsConst)918Res = Nor(B, Not(A));919break;920case 0x31:921if (ABCIsConst)922Res = Nor(Nor(A, Not(C)), B);923break;924case 0x32:925if (ABCIsConst)926Res = Nor(Nor(A, C), B);927break;928case 0x33:929Res = Not(B);930break;931case 0x34:932if (ABCIsConst)933Res = And(Xor(A, B), Nand(B, C));934break;935case 0x35:936if (ABCIsConst)937Res = Xor(B, Or(A, Xnor(B, C)));938break;939case 0x36:940if (ABCIsConst)941Res = Xor(Or(A, C), B);942break;943case 0x37:944if (ABCIsConst)945Res = Nand(Or(A, C), B);946break;947case 0x38:948if (ABCIsConst)949Res = Nor(Xnor(A, B), Nor(A, C));950break;951case 0x39:952if (ABCIsConst)953Res = Xor(Or(A, Not(C)), B);954break;955case 0x3a:956if (ABCIsConst)957Res = Xor(B, Or(A, Xor(B, C)));958break;959case 0x3b:960if (ABCIsConst)961Res = Nand(Or(A, Not(C)), B);962break;963case 0x3c:964Res = Xor(A, B);965break;966case 0x3d:967if (ABCIsConst)968Res = Xor(A, Or(Nor(A, C), B));969break;970case 0x3e:971if (ABCIsConst)972Res = Xor(A, Or(Nor(A, Not(C)), B));973break;974case 0x3f:975if (ABIsConst)976Res = Nand(A, B);977break;978case 0x40:979if (ABCIsConst)980Res = Nor(Nand(A, B), C);981break;982case 0x41:983if (ABCIsConst)984Res = Nor(Xor(A, B), C);985break;986case 0x42:987if (ABCIsConst)988Res = Nor(Xor(A, B), Xnor(A, C));989break;990case 0x43:991if (ABCIsConst)992Res = Xor(A, Nand(Nand(A, C), B));993break;994case 0x44:995if (BCIsConst)996Res = Nor(C, Not(B));997break;998case 0x45:999if (ABCIsConst)1000Res = Nor(Nor(B, Not(A)), C);1001break;1002case 0x46:1003if (ABCIsConst)1004Res = Xor(Or(And(A, C), B), C);1005break;1006case 0x47:1007if (ABCIsConst)1008Res = Xor(Or(Xnor(A, C), B), C);1009break;1010case 0x48:1011if (ABCIsConst)1012Res = And(Xor(A, C), B);1013break;1014case 0x49:1015if (ABCIsConst)1016Res = Xor(Or(Xnor(A, B), And(A, C)), C);1017break;1018case 0x4a:1019if (ABCIsConst)1020Res = Nor(Xnor(A, C), Nor(B, C));1021break;1022case 0x4b:1023if (ABCIsConst)1024Res = Xor(A, Or(C, Not(B)));1025break;1026case 0x4c:1027if (ABCIsConst)1028Res = And(Nand(A, C), B);1029break;1030case 0x4d:1031if (ABCIsConst)1032Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);1033break;1034case 0x4e:1035if (ABCIsConst)1036Res = Xor(A, Or(Xor(A, B), C));1037break;1038case 0x4f:1039if (ABCIsConst)1040Res = Nand(A, Nand(B, Not(C)));1041break;1042case 0x50:1043if (ACIsConst)1044Res = Nor(C, Not(A));1045break;1046case 0x51:1047if (ABCIsConst)1048Res = Nor(Nor(A, Not(B)), C);1049break;1050case 0x52:1051if (ABCIsConst)1052Res = And(Xor(A, C), Nand(B, C));1053break;1054case 0x53:1055if (ABCIsConst)1056Res = Xor(Or(Xnor(B, C), A), C);1057break;1058case 0x54:1059if (ABCIsConst)1060Res = Nor(Nor(A, B), C);1061break;1062case 0x55:1063Res = Not(C);1064break;1065case 0x56:1066if (ABCIsConst)1067Res = Xor(Or(A, B), C);1068break;1069case 0x57:1070if (ABCIsConst)1071Res = Nand(Or(A, B), C);1072break;1073case 0x58:1074if (ABCIsConst)1075Res = Nor(Nor(A, B), Xnor(A, C));1076break;1077case 0x59:1078if (ABCIsConst)1079Res = Xor(Or(A, Not(B)), C);1080break;1081case 0x5a:1082Res = Xor(A, C);1083break;1084case 0x5b:1085if (ABCIsConst)1086Res = Xor(A, Or(Nor(A, B), C));1087break;1088case 0x5c:1089if (ABCIsConst)1090Res = 
Xor(Or(Xor(B, C), A), C);1091break;1092case 0x5d:1093if (ABCIsConst)1094Res = Nand(Or(A, Not(B)), C);1095break;1096case 0x5e:1097if (ABCIsConst)1098Res = Xor(A, Or(Nor(A, Not(B)), C));1099break;1100case 0x5f:1101if (ACIsConst)1102Res = Nand(A, C);1103break;1104case 0x60:1105if (ABCIsConst)1106Res = And(A, Xor(B, C));1107break;1108case 0x61:1109if (ABCIsConst)1110Res = Xor(Or(Xnor(A, B), And(B, C)), C);1111break;1112case 0x62:1113if (ABCIsConst)1114Res = Nor(Nor(A, C), Xnor(B, C));1115break;1116case 0x63:1117if (ABCIsConst)1118Res = Xor(B, Or(C, Not(A)));1119break;1120case 0x64:1121if (ABCIsConst)1122Res = Nor(Nor(A, B), Xnor(B, C));1123break;1124case 0x65:1125if (ABCIsConst)1126Res = Xor(Or(B, Not(A)), C);1127break;1128case 0x66:1129Res = Xor(B, C);1130break;1131case 0x67:1132if (ABCIsConst)1133Res = Or(Nor(A, B), Xor(B, C));1134break;1135case 0x68:1136if (ABCIsConst)1137Res = Xor(Xor(A, B), Nor(Nor(A, B), C));1138break;1139case 0x69:1140if (ABCIsConst)1141Res = Xor(Xnor(A, B), C);1142break;1143case 0x6a:1144if (ABCIsConst)1145Res = Xor(And(A, B), C);1146break;1147case 0x6b:1148if (ABCIsConst)1149Res = Or(Nor(A, B), Xor(Xnor(A, B), C));1150break;1151case 0x6c:1152if (ABCIsConst)1153Res = Xor(And(A, C), B);1154break;1155case 0x6d:1156if (ABCIsConst)1157Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);1158break;1159case 0x6e:1160if (ABCIsConst)1161Res = Or(Nor(A, Not(B)), Xor(B, C));1162break;1163case 0x6f:1164if (ABCIsConst)1165Res = Nand(A, Xnor(B, C));1166break;1167case 0x70:1168if (ABCIsConst)1169Res = And(A, Nand(B, C));1170break;1171case 0x71:1172if (ABCIsConst)1173Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);1174break;1175case 0x72:1176if (ABCIsConst)1177Res = Xor(Or(Xor(A, B), C), B);1178break;1179case 0x73:1180if (ABCIsConst)1181Res = Nand(Nand(A, Not(C)), B);1182break;1183case 0x74:1184if (ABCIsConst)1185Res = Xor(Or(Xor(A, C), B), C);1186break;1187case 0x75:1188if (ABCIsConst)1189Res = Nand(Nand(A, Not(B)), C);1190break;1191case 0x76:1192if (ABCIsConst)1193Res = Xor(B, Or(Nor(B, Not(A)), C));1194break;1195case 0x77:1196if (BCIsConst)1197Res = Nand(B, C);1198break;1199case 0x78:1200if (ABCIsConst)1201Res = Xor(A, And(B, C));1202break;1203case 0x79:1204if (ABCIsConst)1205Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);1206break;1207case 0x7a:1208if (ABCIsConst)1209Res = Or(Xor(A, C), Nor(B, Not(A)));1210break;1211case 0x7b:1212if (ABCIsConst)1213Res = Nand(Xnor(A, C), B);1214break;1215case 0x7c:1216if (ABCIsConst)1217Res = Or(Xor(A, B), Nor(C, Not(A)));1218break;1219case 0x7d:1220if (ABCIsConst)1221Res = Nand(Xnor(A, B), C);1222break;1223case 0x7e:1224if (ABCIsConst)1225Res = Or(Xor(A, B), Xor(A, C));1226break;1227case 0x7f:1228if (ABCIsConst)1229Res = Nand(And(A, B), C);1230break;1231case 0x80:1232if (ABCIsConst)1233Res = And(And(A, B), C);1234break;1235case 0x81:1236if (ABCIsConst)1237Res = Nor(Xor(A, B), Xor(A, C));1238break;1239case 0x82:1240if (ABCIsConst)1241Res = And(Xnor(A, B), C);1242break;1243case 0x83:1244if (ABCIsConst)1245Res = Nor(Xor(A, B), Nor(C, Not(A)));1246break;1247case 0x84:1248if (ABCIsConst)1249Res = And(Xnor(A, C), B);1250break;1251case 0x85:1252if (ABCIsConst)1253Res = Nor(Xor(A, C), Nor(B, Not(A)));1254break;1255case 0x86:1256if (ABCIsConst)1257Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);1258break;1259case 0x87:1260if (ABCIsConst)1261Res = Xor(A, Nand(B, C));1262break;1263case 0x88:1264Res = And(B, C);1265break;1266case 0x89:1267if (ABCIsConst)1268Res = Xor(B, Nor(Nor(B, Not(A)), C));1269break;1270case 0x8a:1271if (ABCIsConst)1272Res = And(Nand(A, Not(B)), C);1273break;1274case 
0x8b:1275if (ABCIsConst)1276Res = Xor(Nor(Xor(A, C), B), C);1277break;1278case 0x8c:1279if (ABCIsConst)1280Res = And(Nand(A, Not(C)), B);1281break;1282case 0x8d:1283if (ABCIsConst)1284Res = Xor(Nor(Xor(A, B), C), B);1285break;1286case 0x8e:1287if (ABCIsConst)1288Res = Xor(Or(Xor(A, B), Xor(A, C)), A);1289break;1290case 0x8f:1291if (ABCIsConst)1292Res = Nand(A, Nand(B, C));1293break;1294case 0x90:1295if (ABCIsConst)1296Res = And(A, Xnor(B, C));1297break;1298case 0x91:1299if (ABCIsConst)1300Res = Nor(Nor(A, Not(B)), Xor(B, C));1301break;1302case 0x92:1303if (ABCIsConst)1304Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);1305break;1306case 0x93:1307if (ABCIsConst)1308Res = Xor(Nand(A, C), B);1309break;1310case 0x94:1311if (ABCIsConst)1312Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));1313break;1314case 0x95:1315if (ABCIsConst)1316Res = Xor(Nand(A, B), C);1317break;1318case 0x96:1319if (ABCIsConst)1320Res = Xor(Xor(A, B), C);1321break;1322case 0x97:1323if (ABCIsConst)1324Res = Xor(Xor(A, B), Or(Nor(A, B), C));1325break;1326case 0x98:1327if (ABCIsConst)1328Res = Nor(Nor(A, B), Xor(B, C));1329break;1330case 0x99:1331if (BCIsConst)1332Res = Xnor(B, C);1333break;1334case 0x9a:1335if (ABCIsConst)1336Res = Xor(Nor(B, Not(A)), C);1337break;1338case 0x9b:1339if (ABCIsConst)1340Res = Or(Nor(A, B), Xnor(B, C));1341break;1342case 0x9c:1343if (ABCIsConst)1344Res = Xor(B, Nor(C, Not(A)));1345break;1346case 0x9d:1347if (ABCIsConst)1348Res = Or(Nor(A, C), Xnor(B, C));1349break;1350case 0x9e:1351if (ABCIsConst)1352Res = Xor(And(Xor(A, B), Nand(B, C)), C);1353break;1354case 0x9f:1355if (ABCIsConst)1356Res = Nand(A, Xor(B, C));1357break;1358case 0xa0:1359Res = And(A, C);1360break;1361case 0xa1:1362if (ABCIsConst)1363Res = Xor(A, Nor(Nor(A, Not(B)), C));1364break;1365case 0xa2:1366if (ABCIsConst)1367Res = And(Or(A, Not(B)), C);1368break;1369case 0xa3:1370if (ABCIsConst)1371Res = Xor(Nor(Xor(B, C), A), C);1372break;1373case 0xa4:1374if (ABCIsConst)1375Res = Xor(A, Nor(Nor(A, B), C));1376break;1377case 0xa5:1378if (ACIsConst)1379Res = Xnor(A, C);1380break;1381case 0xa6:1382if (ABCIsConst)1383Res = Xor(Nor(A, Not(B)), C);1384break;1385case 0xa7:1386if (ABCIsConst)1387Res = Or(Nor(A, B), Xnor(A, C));1388break;1389case 0xa8:1390if (ABCIsConst)1391Res = And(Or(A, B), C);1392break;1393case 0xa9:1394if (ABCIsConst)1395Res = Xor(Nor(A, B), C);1396break;1397case 0xaa:1398Res = C;1399break;1400case 0xab:1401if (ABCIsConst)1402Res = Or(Nor(A, B), C);1403break;1404case 0xac:1405if (ABCIsConst)1406Res = Xor(Nor(Xnor(B, C), A), C);1407break;1408case 0xad:1409if (ABCIsConst)1410Res = Or(Xnor(A, C), And(B, C));1411break;1412case 0xae:1413if (ABCIsConst)1414Res = Or(Nor(A, Not(B)), C);1415break;1416case 0xaf:1417if (ACIsConst)1418Res = Or(C, Not(A));1419break;1420case 0xb0:1421if (ABCIsConst)1422Res = And(A, Nand(B, Not(C)));1423break;1424case 0xb1:1425if (ABCIsConst)1426Res = Xor(A, Nor(Xor(A, B), C));1427break;1428case 0xb2:1429if (ABCIsConst)1430Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);1431break;1432case 0xb3:1433if (ABCIsConst)1434Res = Nand(Nand(A, C), B);1435break;1436case 0xb4:1437if (ABCIsConst)1438Res = Xor(A, Nor(C, Not(B)));1439break;1440case 0xb5:1441if (ABCIsConst)1442Res = Or(Xnor(A, C), Nor(B, C));1443break;1444case 0xb6:1445if (ABCIsConst)1446Res = Xor(And(Xor(A, B), Nand(A, C)), C);1447break;1448case 0xb7:1449if (ABCIsConst)1450Res = Nand(Xor(A, C), B);1451break;1452case 0xb8:1453if (ABCIsConst)1454Res = Xor(Nor(Xnor(A, C), B), C);1455break;1456case 0xb9:1457if (ABCIsConst)1458Res = Xor(Nor(And(A, C), B), 
C);1459break;1460case 0xba:1461if (ABCIsConst)1462Res = Or(Nor(B, Not(A)), C);1463break;1464case 0xbb:1465if (BCIsConst)1466Res = Or(C, Not(B));1467break;1468case 0xbc:1469if (ABCIsConst)1470Res = Xor(A, And(Nand(A, C), B));1471break;1472case 0xbd:1473if (ABCIsConst)1474Res = Or(Xor(A, B), Xnor(A, C));1475break;1476case 0xbe:1477if (ABCIsConst)1478Res = Or(Xor(A, B), C);1479break;1480case 0xbf:1481if (ABCIsConst)1482Res = Or(Nand(A, B), C);1483break;1484case 0xc0:1485Res = And(A, B);1486break;1487case 0xc1:1488if (ABCIsConst)1489Res = Xor(A, Nor(Nor(A, Not(C)), B));1490break;1491case 0xc2:1492if (ABCIsConst)1493Res = Xor(A, Nor(Nor(A, C), B));1494break;1495case 0xc3:1496if (ABIsConst)1497Res = Xnor(A, B);1498break;1499case 0xc4:1500if (ABCIsConst)1501Res = And(Or(A, Not(C)), B);1502break;1503case 0xc5:1504if (ABCIsConst)1505Res = Xor(B, Nor(A, Xor(B, C)));1506break;1507case 0xc6:1508if (ABCIsConst)1509Res = Xor(Nor(A, Not(C)), B);1510break;1511case 0xc7:1512if (ABCIsConst)1513Res = Or(Xnor(A, B), Nor(A, C));1514break;1515case 0xc8:1516if (ABCIsConst)1517Res = And(Or(A, C), B);1518break;1519case 0xc9:1520if (ABCIsConst)1521Res = Xor(Nor(A, C), B);1522break;1523case 0xca:1524if (ABCIsConst)1525Res = Xor(B, Nor(A, Xnor(B, C)));1526break;1527case 0xcb:1528if (ABCIsConst)1529Res = Or(Xnor(A, B), And(B, C));1530break;1531case 0xcc:1532Res = B;1533break;1534case 0xcd:1535if (ABCIsConst)1536Res = Or(Nor(A, C), B);1537break;1538case 0xce:1539if (ABCIsConst)1540Res = Or(Nor(A, Not(C)), B);1541break;1542case 0xcf:1543if (ABIsConst)1544Res = Or(B, Not(A));1545break;1546case 0xd0:1547if (ABCIsConst)1548Res = And(A, Or(B, Not(C)));1549break;1550case 0xd1:1551if (ABCIsConst)1552Res = Xor(A, Nor(Xor(A, C), B));1553break;1554case 0xd2:1555if (ABCIsConst)1556Res = Xor(A, Nor(B, Not(C)));1557break;1558case 0xd3:1559if (ABCIsConst)1560Res = Or(Xnor(A, B), Nor(B, C));1561break;1562case 0xd4:1563if (ABCIsConst)1564Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);1565break;1566case 0xd5:1567if (ABCIsConst)1568Res = Nand(Nand(A, B), C);1569break;1570case 0xd6:1571if (ABCIsConst)1572Res = Xor(Xor(A, B), Or(And(A, B), C));1573break;1574case 0xd7:1575if (ABCIsConst)1576Res = Nand(Xor(A, B), C);1577break;1578case 0xd8:1579if (ABCIsConst)1580Res = Xor(Nor(Xnor(A, B), C), B);1581break;1582case 0xd9:1583if (ABCIsConst)1584Res = Or(And(A, B), Xnor(B, C));1585break;1586case 0xda:1587if (ABCIsConst)1588Res = Xor(A, And(Nand(A, B), C));1589break;1590case 0xdb:1591if (ABCIsConst)1592Res = Or(Xnor(A, B), Xor(A, C));1593break;1594case 0xdc:1595if (ABCIsConst)1596Res = Or(B, Nor(C, Not(A)));1597break;1598case 0xdd:1599if (BCIsConst)1600Res = Or(B, Not(C));1601break;1602case 0xde:1603if (ABCIsConst)1604Res = Or(Xor(A, C), B);1605break;1606case 0xdf:1607if (ABCIsConst)1608Res = Or(Nand(A, C), B);1609break;1610case 0xe0:1611if (ABCIsConst)1612Res = And(A, Or(B, C));1613break;1614case 0xe1:1615if (ABCIsConst)1616Res = Xor(A, Nor(B, C));1617break;1618case 0xe2:1619if (ABCIsConst)1620Res = Xor(A, Nor(Xnor(A, C), B));1621break;1622case 0xe3:1623if (ABCIsConst)1624Res = Xor(A, Nor(And(A, C), B));1625break;1626case 0xe4:1627if (ABCIsConst)1628Res = Xor(A, Nor(Xnor(A, B), C));1629break;1630case 0xe5:1631if (ABCIsConst)1632Res = Xor(A, Nor(And(A, B), C));1633break;1634case 0xe6:1635if (ABCIsConst)1636Res = Or(And(A, B), Xor(B, C));1637break;1638case 0xe7:1639if (ABCIsConst)1640Res = Or(Xnor(A, B), Xnor(A, C));1641break;1642case 0xe8:1643if (ABCIsConst)1644Res = Xor(Or(A, B), Nor(Xnor(A, B), C));1645break;1646case 0xe9:1647if (ABCIsConst)1648Res = 
Xor(Xor(A, B), Nand(Nand(A, B), C));1649break;1650case 0xea:1651if (ABCIsConst)1652Res = Or(And(A, B), C);1653break;1654case 0xeb:1655if (ABCIsConst)1656Res = Or(Xnor(A, B), C);1657break;1658case 0xec:1659if (ABCIsConst)1660Res = Or(And(A, C), B);1661break;1662case 0xed:1663if (ABCIsConst)1664Res = Or(Xnor(A, C), B);1665break;1666case 0xee:1667Res = Or(B, C);1668break;1669case 0xef:1670if (ABCIsConst)1671Res = Nand(A, Nor(B, C));1672break;1673case 0xf0:1674Res = A;1675break;1676case 0xf1:1677if (ABCIsConst)1678Res = Or(A, Nor(B, C));1679break;1680case 0xf2:1681if (ABCIsConst)1682Res = Or(A, Nor(B, Not(C)));1683break;1684case 0xf3:1685if (ABIsConst)1686Res = Or(A, Not(B));1687break;1688case 0xf4:1689if (ABCIsConst)1690Res = Or(A, Nor(C, Not(B)));1691break;1692case 0xf5:1693if (ACIsConst)1694Res = Or(A, Not(C));1695break;1696case 0xf6:1697if (ABCIsConst)1698Res = Or(A, Xor(B, C));1699break;1700case 0xf7:1701if (ABCIsConst)1702Res = Or(A, Nand(B, C));1703break;1704case 0xf8:1705if (ABCIsConst)1706Res = Or(A, And(B, C));1707break;1708case 0xf9:1709if (ABCIsConst)1710Res = Or(A, Xnor(B, C));1711break;1712case 0xfa:1713Res = Or(A, C);1714break;1715case 0xfb:1716if (ABCIsConst)1717Res = Nand(Nor(A, C), B);1718break;1719case 0xfc:1720Res = Or(A, B);1721break;1722case 0xfd:1723if (ABCIsConst)1724Res = Nand(Nor(A, B), C);1725break;1726case 0xfe:1727if (ABCIsConst)1728Res = Or(Or(A, B), C);1729break;1730case 0xff:1731Res = {Constant::getAllOnesValue(Ty), 0xff};1732break;1733}17341735assert((Res.first == nullptr || Res.second == Imm) &&1736"Simplification of ternary logic does not verify!");1737return Res.first;1738}17391740static Value *simplifyX86insertps(const IntrinsicInst &II,1741InstCombiner::BuilderTy &Builder) {1742auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));1743if (!CInt)1744return nullptr;17451746auto *VecTy = cast<FixedVectorType>(II.getType());1747assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");17481749// The immediate permute control byte looks like this:1750// [3:0] - zero mask for each 32-bit lane1751// [5:4] - select one 32-bit destination lane1752// [7:6] - select one 32-bit source lane17531754uint8_t Imm = CInt->getZExtValue();1755uint8_t ZMask = Imm & 0xf;1756uint8_t DestLane = (Imm >> 4) & 0x3;1757uint8_t SourceLane = (Imm >> 6) & 0x3;17581759ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);17601761// If all zero mask bits are set, this was just a weird way to1762// generate a zero vector.1763if (ZMask == 0xf)1764return ZeroVector;17651766// Initialize by passing all of the first source bits through.1767int ShuffleMask[4] = {0, 1, 2, 3};17681769// We may replace the second operand with the zero vector.1770Value *V1 = II.getArgOperand(1);17711772if (ZMask) {1773// If the zero mask is being used with a single input or the zero mask1774// overrides the destination lane, this is a shuffle with the zero vector.1775if ((II.getArgOperand(0) == II.getArgOperand(1)) ||1776(ZMask & (1 << DestLane))) {1777V1 = ZeroVector;1778// We may still move 32-bits of the first source vector from one lane1779// to another.1780ShuffleMask[DestLane] = SourceLane;1781// The zero mask may override the previous insert operation.1782for (unsigned i = 0; i < 4; ++i)1783if ((ZMask >> i) & 0x1)1784ShuffleMask[i] = i + 4;1785} else {1786// TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?1787return nullptr;1788}1789} else {1790// Replace the selected destination lane with the selected source lane.1791ShuffleMask[DestLane] = SourceLane + 
4;1792}17931794return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);1795}17961797/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding1798/// or conversion to a shuffle vector.1799static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,1800ConstantInt *CILength, ConstantInt *CIIndex,1801InstCombiner::BuilderTy &Builder) {1802auto LowConstantHighUndef = [&](uint64_t Val) {1803Type *IntTy64 = Type::getInt64Ty(II.getContext());1804Constant *Args[] = {ConstantInt::get(IntTy64, Val),1805UndefValue::get(IntTy64)};1806return ConstantVector::get(Args);1807};18081809// See if we're dealing with constant values.1810auto *C0 = dyn_cast<Constant>(Op0);1811auto *CI0 =1812C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))1813: nullptr;18141815// Attempt to constant fold.1816if (CILength && CIIndex) {1817// From AMD documentation: "The bit index and field length are each six1818// bits in length other bits of the field are ignored."1819APInt APIndex = CIIndex->getValue().zextOrTrunc(6);1820APInt APLength = CILength->getValue().zextOrTrunc(6);18211822unsigned Index = APIndex.getZExtValue();18231824// From AMD documentation: "a value of zero in the field length is1825// defined as length of 64".1826unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();18271828// From AMD documentation: "If the sum of the bit index + length field1829// is greater than 64, the results are undefined".1830unsigned End = Index + Length;18311832// Note that both field index and field length are 8-bit quantities.1833// Since variables 'Index' and 'Length' are unsigned values1834// obtained from zero-extending field index and field length1835// respectively, their sum should never wrap around.1836if (End > 64)1837return UndefValue::get(II.getType());18381839// If we are inserting whole bytes, we can convert this to a shuffle.1840// Lowering can recognize EXTRQI shuffle masks.1841if ((Length % 8) == 0 && (Index % 8) == 0) {1842// Convert bit indices to byte indices.1843Length /= 8;1844Index /= 8;18451846Type *IntTy8 = Type::getInt8Ty(II.getContext());1847auto *ShufTy = FixedVectorType::get(IntTy8, 16);18481849SmallVector<int, 16> ShuffleMask;1850for (int i = 0; i != (int)Length; ++i)1851ShuffleMask.push_back(i + Index);1852for (int i = Length; i != 8; ++i)1853ShuffleMask.push_back(i + 16);1854for (int i = 8; i != 16; ++i)1855ShuffleMask.push_back(-1);18561857Value *SV = Builder.CreateShuffleVector(1858Builder.CreateBitCast(Op0, ShufTy),1859ConstantAggregateZero::get(ShufTy), ShuffleMask);1860return Builder.CreateBitCast(SV, II.getType());1861}18621863// Constant Fold - shift Index'th bit to lowest position and mask off1864// Length bits.1865if (CI0) {1866APInt Elt = CI0->getValue();1867Elt.lshrInPlace(Index);1868Elt = Elt.zextOrTrunc(Length);1869return LowConstantHighUndef(Elt.getZExtValue());1870}18711872// If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.1873if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {1874Value *Args[] = {Op0, CILength, CIIndex};1875Module *M = II.getModule();1876Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);1877return Builder.CreateCall(F, Args);1878}1879}18801881// Constant Fold - extraction from zero is always {zero, undef}.1882if (CI0 && CI0->isZero())1883return LowConstantHighUndef(0);18841885return nullptr;1886}18871888/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant1889/// folding or conversion to a shuffle vector.1890static Value 
*simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,1891APInt APLength, APInt APIndex,1892InstCombiner::BuilderTy &Builder) {1893// From AMD documentation: "The bit index and field length are each six bits1894// in length other bits of the field are ignored."1895APIndex = APIndex.zextOrTrunc(6);1896APLength = APLength.zextOrTrunc(6);18971898// Attempt to constant fold.1899unsigned Index = APIndex.getZExtValue();19001901// From AMD documentation: "a value of zero in the field length is1902// defined as length of 64".1903unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();19041905// From AMD documentation: "If the sum of the bit index + length field1906// is greater than 64, the results are undefined".1907unsigned End = Index + Length;19081909// Note that both field index and field length are 8-bit quantities.1910// Since variables 'Index' and 'Length' are unsigned values1911// obtained from zero-extending field index and field length1912// respectively, their sum should never wrap around.1913if (End > 64)1914return UndefValue::get(II.getType());19151916// If we are inserting whole bytes, we can convert this to a shuffle.1917// Lowering can recognize INSERTQI shuffle masks.1918if ((Length % 8) == 0 && (Index % 8) == 0) {1919// Convert bit indices to byte indices.1920Length /= 8;1921Index /= 8;19221923Type *IntTy8 = Type::getInt8Ty(II.getContext());1924auto *ShufTy = FixedVectorType::get(IntTy8, 16);19251926SmallVector<int, 16> ShuffleMask;1927for (int i = 0; i != (int)Index; ++i)1928ShuffleMask.push_back(i);1929for (int i = 0; i != (int)Length; ++i)1930ShuffleMask.push_back(i + 16);1931for (int i = Index + Length; i != 8; ++i)1932ShuffleMask.push_back(i);1933for (int i = 8; i != 16; ++i)1934ShuffleMask.push_back(-1);19351936Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),1937Builder.CreateBitCast(Op1, ShufTy),1938ShuffleMask);1939return Builder.CreateBitCast(SV, II.getType());1940}19411942// See if we're dealing with constant values.1943auto *C0 = dyn_cast<Constant>(Op0);1944auto *C1 = dyn_cast<Constant>(Op1);1945auto *CI00 =1946C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))1947: nullptr;1948auto *CI10 =1949C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))1950: nullptr;19511952// Constant Fold - insert bottom Length bits starting at the Index'th bit.1953if (CI00 && CI10) {1954APInt V00 = CI00->getValue();1955APInt V10 = CI10->getValue();1956APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);1957V00 = V00 & ~Mask;1958V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);1959APInt Val = V00 | V10;1960Type *IntTy64 = Type::getInt64Ty(II.getContext());1961Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),1962UndefValue::get(IntTy64)};1963return ConstantVector::get(Args);1964}19651966// If we were an INSERTQ call, we'll save demanded elements if we convert to1967// INSERTQI.1968if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {1969Type *IntTy8 = Type::getInt8Ty(II.getContext());1970Constant *CILength = ConstantInt::get(IntTy8, Length, false);1971Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);19721973Value *Args[] = {Op0, Op1, CILength, CIIndex};1974Module *M = II.getModule();1975Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);1976return Builder.CreateCall(F, Args);1977}19781979return nullptr;1980}19811982/// Attempt to convert pshufb* to shufflevector if the mask is constant.1983static Value *simplifyX86pshufb(const IntrinsicInst &II,1984InstCombiner::BuilderTy &Builder) {1985auto *V = dyn_cast<Constant>(II.getArgOperand(1));1986if (!V)1987return nullptr;19881989auto *VecTy = cast<FixedVectorType>(II.getType());1990unsigned NumElts = VecTy->getNumElements();1991assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&1992"Unexpected number of elements in shuffle mask!");19931994// Construct a shuffle mask from constant integers or UNDEFs.1995int Indexes[64];19961997// Each byte in the shuffle control mask forms an index to permute the1998// corresponding byte in the destination operand.1999for (unsigned I = 0; I < NumElts; ++I) {2000Constant *COp = V->getAggregateElement(I);2001if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2002return nullptr;20032004if (isa<UndefValue>(COp)) {2005Indexes[I] = -1;2006continue;2007}20082009int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();20102011// If the most significant bit (bit[7]) of each byte of the shuffle2012// control mask is set, then zero is written in the result byte.2013// The zero vector is in the right-hand side of the resulting2014// shufflevector.20152016// The value of each index for the high 128-bit lane is the least2017// significant 4 bits of the respective shuffle control byte.2018Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);2019Indexes[I] = Index;2020}20212022auto V1 = II.getArgOperand(0);2023auto V2 = Constant::getNullValue(VecTy);2024return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));2025}20262027/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.2028static Value *simplifyX86vpermilvar(const IntrinsicInst &II,2029InstCombiner::BuilderTy &Builder) {2030auto *V = dyn_cast<Constant>(II.getArgOperand(1));2031if (!V)2032return nullptr;20332034auto *VecTy = cast<FixedVectorType>(II.getType());2035unsigned NumElts = VecTy->getNumElements();2036bool IsPD = VecTy->getScalarType()->isDoubleTy();2037unsigned NumLaneElts = IsPD ? 
2 : 4;2038assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);20392040// Construct a shuffle mask from constant integers or UNDEFs.2041int Indexes[16];20422043// The intrinsics only read one or two bits, clear the rest.2044for (unsigned I = 0; I < NumElts; ++I) {2045Constant *COp = V->getAggregateElement(I);2046if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2047return nullptr;20482049if (isa<UndefValue>(COp)) {2050Indexes[I] = -1;2051continue;2052}20532054APInt Index = cast<ConstantInt>(COp)->getValue();2055Index = Index.zextOrTrunc(32).getLoBits(2);20562057// The PD variants uses bit 1 to select per-lane element index, so2058// shift down to convert to generic shuffle mask index.2059if (IsPD)2060Index.lshrInPlace(1);20612062// The _256 variants are a bit trickier since the mask bits always index2063// into the corresponding 128 half. In order to convert to a generic2064// shuffle, we have to make that explicit.2065Index += APInt(32, (I / NumLaneElts) * NumLaneElts);20662067Indexes[I] = Index.getZExtValue();2068}20692070auto V1 = II.getArgOperand(0);2071return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));2072}20732074/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.2075static Value *simplifyX86vpermv(const IntrinsicInst &II,2076InstCombiner::BuilderTy &Builder) {2077auto *V = dyn_cast<Constant>(II.getArgOperand(1));2078if (!V)2079return nullptr;20802081auto *VecTy = cast<FixedVectorType>(II.getType());2082unsigned Size = VecTy->getNumElements();2083assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&2084"Unexpected shuffle mask size");20852086// Construct a shuffle mask from constant integers or UNDEFs.2087int Indexes[64];20882089for (unsigned I = 0; I < Size; ++I) {2090Constant *COp = V->getAggregateElement(I);2091if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2092return nullptr;20932094if (isa<UndefValue>(COp)) {2095Indexes[I] = -1;2096continue;2097}20982099uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();2100Index &= Size - 1;2101Indexes[I] = Index;2102}21032104auto V1 = II.getArgOperand(0);2105return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));2106}21072108/// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.2109static Value *simplifyX86vpermv3(const IntrinsicInst &II,2110InstCombiner::BuilderTy &Builder) {2111auto *V = dyn_cast<Constant>(II.getArgOperand(1));2112if (!V)2113return nullptr;21142115auto *VecTy = cast<FixedVectorType>(II.getType());2116unsigned Size = VecTy->getNumElements();2117assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||2118Size == 64) &&2119"Unexpected shuffle mask size");21202121// Construct a shuffle mask from constant integers or UNDEFs.2122int Indexes[64];21232124for (unsigned I = 0; I < Size; ++I) {2125Constant *COp = V->getAggregateElement(I);2126if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))2127return nullptr;21282129if (isa<UndefValue>(COp)) {2130Indexes[I] = -1;2131continue;2132}21332134uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();2135Index &= (2 * Size) - 1;2136Indexes[I] = Index;2137}21382139auto V1 = II.getArgOperand(0);2140auto V2 = II.getArgOperand(2);2141return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));2142}21432144std::optional<Instruction *>2145X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {2146auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,2147unsigned 
                                             DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors.
If2319// we can simplify the input based on that, do so now.2320Value *Arg = II.getArgOperand(0);2321unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();2322if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {2323return IC.replaceOperand(II, 0, V);2324}2325break;2326}23272328case Intrinsic::x86_mmx_pmovmskb:2329case Intrinsic::x86_sse_movmsk_ps:2330case Intrinsic::x86_sse2_movmsk_pd:2331case Intrinsic::x86_sse2_pmovmskb_128:2332case Intrinsic::x86_avx_movmsk_pd_256:2333case Intrinsic::x86_avx_movmsk_ps_256:2334case Intrinsic::x86_avx2_pmovmskb:2335if (Value *V = simplifyX86movmsk(II, IC.Builder)) {2336return IC.replaceInstUsesWith(II, V);2337}2338break;23392340case Intrinsic::x86_sse_comieq_ss:2341case Intrinsic::x86_sse_comige_ss:2342case Intrinsic::x86_sse_comigt_ss:2343case Intrinsic::x86_sse_comile_ss:2344case Intrinsic::x86_sse_comilt_ss:2345case Intrinsic::x86_sse_comineq_ss:2346case Intrinsic::x86_sse_ucomieq_ss:2347case Intrinsic::x86_sse_ucomige_ss:2348case Intrinsic::x86_sse_ucomigt_ss:2349case Intrinsic::x86_sse_ucomile_ss:2350case Intrinsic::x86_sse_ucomilt_ss:2351case Intrinsic::x86_sse_ucomineq_ss:2352case Intrinsic::x86_sse2_comieq_sd:2353case Intrinsic::x86_sse2_comige_sd:2354case Intrinsic::x86_sse2_comigt_sd:2355case Intrinsic::x86_sse2_comile_sd:2356case Intrinsic::x86_sse2_comilt_sd:2357case Intrinsic::x86_sse2_comineq_sd:2358case Intrinsic::x86_sse2_ucomieq_sd:2359case Intrinsic::x86_sse2_ucomige_sd:2360case Intrinsic::x86_sse2_ucomigt_sd:2361case Intrinsic::x86_sse2_ucomile_sd:2362case Intrinsic::x86_sse2_ucomilt_sd:2363case Intrinsic::x86_sse2_ucomineq_sd:2364case Intrinsic::x86_avx512_vcomi_ss:2365case Intrinsic::x86_avx512_vcomi_sd:2366case Intrinsic::x86_avx512_mask_cmp_ss:2367case Intrinsic::x86_avx512_mask_cmp_sd: {2368// These intrinsics only demand the 0th element of their input vectors. 
If2369// we can simplify the input based on that, do so now.2370bool MadeChange = false;2371Value *Arg0 = II.getArgOperand(0);2372Value *Arg1 = II.getArgOperand(1);2373unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();2374if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {2375IC.replaceOperand(II, 0, V);2376MadeChange = true;2377}2378if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {2379IC.replaceOperand(II, 1, V);2380MadeChange = true;2381}2382if (MadeChange) {2383return &II;2384}2385break;2386}23872388case Intrinsic::x86_avx512_add_ps_512:2389case Intrinsic::x86_avx512_div_ps_512:2390case Intrinsic::x86_avx512_mul_ps_512:2391case Intrinsic::x86_avx512_sub_ps_512:2392case Intrinsic::x86_avx512_add_pd_512:2393case Intrinsic::x86_avx512_div_pd_512:2394case Intrinsic::x86_avx512_mul_pd_512:2395case Intrinsic::x86_avx512_sub_pd_512:2396// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular2397// IR operations.2398if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {2399if (R->getValue() == 4) {2400Value *Arg0 = II.getArgOperand(0);2401Value *Arg1 = II.getArgOperand(1);24022403Value *V;2404switch (IID) {2405default:2406llvm_unreachable("Case stmts out of sync!");2407case Intrinsic::x86_avx512_add_ps_512:2408case Intrinsic::x86_avx512_add_pd_512:2409V = IC.Builder.CreateFAdd(Arg0, Arg1);2410break;2411case Intrinsic::x86_avx512_sub_ps_512:2412case Intrinsic::x86_avx512_sub_pd_512:2413V = IC.Builder.CreateFSub(Arg0, Arg1);2414break;2415case Intrinsic::x86_avx512_mul_ps_512:2416case Intrinsic::x86_avx512_mul_pd_512:2417V = IC.Builder.CreateFMul(Arg0, Arg1);2418break;2419case Intrinsic::x86_avx512_div_ps_512:2420case Intrinsic::x86_avx512_div_pd_512:2421V = IC.Builder.CreateFDiv(Arg0, Arg1);2422break;2423}24242425return IC.replaceInstUsesWith(II, V);2426}2427}2428break;24292430case Intrinsic::x86_avx512_mask_add_ss_round:2431case Intrinsic::x86_avx512_mask_div_ss_round:2432case Intrinsic::x86_avx512_mask_mul_ss_round:2433case Intrinsic::x86_avx512_mask_sub_ss_round:2434case Intrinsic::x86_avx512_mask_add_sd_round:2435case Intrinsic::x86_avx512_mask_div_sd_round:2436case Intrinsic::x86_avx512_mask_mul_sd_round:2437case Intrinsic::x86_avx512_mask_sub_sd_round:2438// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular2439// IR operations.2440if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {2441if (R->getValue() == 4) {2442// Extract the element as scalars.2443Value *Arg0 = II.getArgOperand(0);2444Value *Arg1 = II.getArgOperand(1);2445Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);2446Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);24472448Value *V;2449switch (IID) {2450default:2451llvm_unreachable("Case stmts out of sync!");2452case Intrinsic::x86_avx512_mask_add_ss_round:2453case Intrinsic::x86_avx512_mask_add_sd_round:2454V = IC.Builder.CreateFAdd(LHS, RHS);2455break;2456case Intrinsic::x86_avx512_mask_sub_ss_round:2457case Intrinsic::x86_avx512_mask_sub_sd_round:2458V = IC.Builder.CreateFSub(LHS, RHS);2459break;2460case Intrinsic::x86_avx512_mask_mul_ss_round:2461case Intrinsic::x86_avx512_mask_mul_sd_round:2462V = IC.Builder.CreateFMul(LHS, RHS);2463break;2464case Intrinsic::x86_avx512_mask_div_ss_round:2465case Intrinsic::x86_avx512_mask_div_sd_round:2466V = IC.Builder.CreateFDiv(LHS, RHS);2467break;2468}24692470// Handle the masking aspect of the intrinsic.2471Value *Mask = II.getArgOperand(3);2472auto *C = dyn_cast<ConstantInt>(Mask);2473// We don't need a 
select if we know the mask bit is a 1.2474if (!C || !C->getValue()[0]) {2475// Cast the mask to an i1 vector and then extract the lowest element.2476auto *MaskTy = FixedVectorType::get(2477IC.Builder.getInt1Ty(),2478cast<IntegerType>(Mask->getType())->getBitWidth());2479Mask = IC.Builder.CreateBitCast(Mask, MaskTy);2480Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);2481// Extract the lowest element from the passthru operand.2482Value *Passthru =2483IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);2484V = IC.Builder.CreateSelect(Mask, V, Passthru);2485}24862487// Insert the result back into the original argument 0.2488V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);24892490return IC.replaceInstUsesWith(II, V);2491}2492}2493break;24942495// Constant fold ashr( <A x Bi>, Ci ).2496// Constant fold lshr( <A x Bi>, Ci ).2497// Constant fold shl( <A x Bi>, Ci ).2498case Intrinsic::x86_sse2_psrai_d:2499case Intrinsic::x86_sse2_psrai_w:2500case Intrinsic::x86_avx2_psrai_d:2501case Intrinsic::x86_avx2_psrai_w:2502case Intrinsic::x86_avx512_psrai_q_128:2503case Intrinsic::x86_avx512_psrai_q_256:2504case Intrinsic::x86_avx512_psrai_d_512:2505case Intrinsic::x86_avx512_psrai_q_512:2506case Intrinsic::x86_avx512_psrai_w_512:2507case Intrinsic::x86_sse2_psrli_d:2508case Intrinsic::x86_sse2_psrli_q:2509case Intrinsic::x86_sse2_psrli_w:2510case Intrinsic::x86_avx2_psrli_d:2511case Intrinsic::x86_avx2_psrli_q:2512case Intrinsic::x86_avx2_psrli_w:2513case Intrinsic::x86_avx512_psrli_d_512:2514case Intrinsic::x86_avx512_psrli_q_512:2515case Intrinsic::x86_avx512_psrli_w_512:2516case Intrinsic::x86_sse2_pslli_d:2517case Intrinsic::x86_sse2_pslli_q:2518case Intrinsic::x86_sse2_pslli_w:2519case Intrinsic::x86_avx2_pslli_d:2520case Intrinsic::x86_avx2_pslli_q:2521case Intrinsic::x86_avx2_pslli_w:2522case Intrinsic::x86_avx512_pslli_d_512:2523case Intrinsic::x86_avx512_pslli_q_512:2524case Intrinsic::x86_avx512_pslli_w_512:2525if (Value *V = simplifyX86immShift(II, IC.Builder)) {2526return IC.replaceInstUsesWith(II, V);2527}2528break;25292530case Intrinsic::x86_sse2_psra_d:2531case Intrinsic::x86_sse2_psra_w:2532case Intrinsic::x86_avx2_psra_d:2533case Intrinsic::x86_avx2_psra_w:2534case Intrinsic::x86_avx512_psra_q_128:2535case Intrinsic::x86_avx512_psra_q_256:2536case Intrinsic::x86_avx512_psra_d_512:2537case Intrinsic::x86_avx512_psra_q_512:2538case Intrinsic::x86_avx512_psra_w_512:2539case Intrinsic::x86_sse2_psrl_d:2540case Intrinsic::x86_sse2_psrl_q:2541case Intrinsic::x86_sse2_psrl_w:2542case Intrinsic::x86_avx2_psrl_d:2543case Intrinsic::x86_avx2_psrl_q:2544case Intrinsic::x86_avx2_psrl_w:2545case Intrinsic::x86_avx512_psrl_d_512:2546case Intrinsic::x86_avx512_psrl_q_512:2547case Intrinsic::x86_avx512_psrl_w_512:2548case Intrinsic::x86_sse2_psll_d:2549case Intrinsic::x86_sse2_psll_q:2550case Intrinsic::x86_sse2_psll_w:2551case Intrinsic::x86_avx2_psll_d:2552case Intrinsic::x86_avx2_psll_q:2553case Intrinsic::x86_avx2_psll_w:2554case Intrinsic::x86_avx512_psll_d_512:2555case Intrinsic::x86_avx512_psll_q_512:2556case Intrinsic::x86_avx512_psll_w_512: {2557if (Value *V = simplifyX86immShift(II, IC.Builder)) {2558return IC.replaceInstUsesWith(II, V);2559}25602561// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector2562// operand to compute the shift amount.2563Value *Arg1 = II.getArgOperand(1);2564assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&2565"Unexpected packed shift size");2566unsigned VWidth = 
cast<FixedVectorType>(Arg1->getType())->getNumElements();25672568if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {2569return IC.replaceOperand(II, 1, V);2570}2571break;2572}25732574case Intrinsic::x86_avx2_psllv_d:2575case Intrinsic::x86_avx2_psllv_d_256:2576case Intrinsic::x86_avx2_psllv_q:2577case Intrinsic::x86_avx2_psllv_q_256:2578case Intrinsic::x86_avx512_psllv_d_512:2579case Intrinsic::x86_avx512_psllv_q_512:2580case Intrinsic::x86_avx512_psllv_w_128:2581case Intrinsic::x86_avx512_psllv_w_256:2582case Intrinsic::x86_avx512_psllv_w_512:2583case Intrinsic::x86_avx2_psrav_d:2584case Intrinsic::x86_avx2_psrav_d_256:2585case Intrinsic::x86_avx512_psrav_q_128:2586case Intrinsic::x86_avx512_psrav_q_256:2587case Intrinsic::x86_avx512_psrav_d_512:2588case Intrinsic::x86_avx512_psrav_q_512:2589case Intrinsic::x86_avx512_psrav_w_128:2590case Intrinsic::x86_avx512_psrav_w_256:2591case Intrinsic::x86_avx512_psrav_w_512:2592case Intrinsic::x86_avx2_psrlv_d:2593case Intrinsic::x86_avx2_psrlv_d_256:2594case Intrinsic::x86_avx2_psrlv_q:2595case Intrinsic::x86_avx2_psrlv_q_256:2596case Intrinsic::x86_avx512_psrlv_d_512:2597case Intrinsic::x86_avx512_psrlv_q_512:2598case Intrinsic::x86_avx512_psrlv_w_128:2599case Intrinsic::x86_avx512_psrlv_w_256:2600case Intrinsic::x86_avx512_psrlv_w_512:2601if (Value *V = simplifyX86varShift(II, IC.Builder)) {2602return IC.replaceInstUsesWith(II, V);2603}2604break;26052606case Intrinsic::x86_sse2_packssdw_128:2607case Intrinsic::x86_sse2_packsswb_128:2608case Intrinsic::x86_avx2_packssdw:2609case Intrinsic::x86_avx2_packsswb:2610case Intrinsic::x86_avx512_packssdw_512:2611case Intrinsic::x86_avx512_packsswb_512:2612if (Value *V = simplifyX86pack(II, IC.Builder, true)) {2613return IC.replaceInstUsesWith(II, V);2614}2615break;26162617case Intrinsic::x86_sse2_packuswb_128:2618case Intrinsic::x86_sse41_packusdw:2619case Intrinsic::x86_avx2_packusdw:2620case Intrinsic::x86_avx2_packuswb:2621case Intrinsic::x86_avx512_packusdw_512:2622case Intrinsic::x86_avx512_packuswb_512:2623if (Value *V = simplifyX86pack(II, IC.Builder, false)) {2624return IC.replaceInstUsesWith(II, V);2625}2626break;26272628case Intrinsic::x86_sse2_pmulh_w:2629case Intrinsic::x86_avx2_pmulh_w:2630case Intrinsic::x86_avx512_pmulh_w_512:2631if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {2632return IC.replaceInstUsesWith(II, V);2633}2634break;26352636case Intrinsic::x86_sse2_pmulhu_w:2637case Intrinsic::x86_avx2_pmulhu_w:2638case Intrinsic::x86_avx512_pmulhu_w_512:2639if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {2640return IC.replaceInstUsesWith(II, V);2641}2642break;26432644case Intrinsic::x86_ssse3_pmul_hr_sw_128:2645case Intrinsic::x86_avx2_pmul_hr_sw:2646case Intrinsic::x86_avx512_pmul_hr_sw_512:2647if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {2648return IC.replaceInstUsesWith(II, V);2649}2650break;26512652case Intrinsic::x86_sse2_pmadd_wd:2653case Intrinsic::x86_avx2_pmadd_wd:2654case Intrinsic::x86_avx512_pmaddw_d_512:2655if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {2656return IC.replaceInstUsesWith(II, V);2657}2658break;26592660case Intrinsic::x86_ssse3_pmadd_ub_sw_128:2661case Intrinsic::x86_avx2_pmadd_ub_sw:2662case Intrinsic::x86_avx512_pmaddubs_w_512:2663if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {2664return IC.replaceInstUsesWith(II, V);2665}2666break;26672668case Intrinsic::x86_pclmulqdq:2669case Intrinsic::x86_pclmulqdq_256:2670case Intrinsic::x86_pclmulqdq_512: {2671if (auto *C = 
dyn_cast<ConstantInt>(II.getArgOperand(2))) {2672unsigned Imm = C->getZExtValue();26732674bool MadeChange = false;2675Value *Arg0 = II.getArgOperand(0);2676Value *Arg1 = II.getArgOperand(1);2677unsigned VWidth =2678cast<FixedVectorType>(Arg0->getType())->getNumElements();26792680APInt UndefElts1(VWidth, 0);2681APInt DemandedElts1 =2682APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));2683if (Value *V =2684IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {2685IC.replaceOperand(II, 0, V);2686MadeChange = true;2687}26882689APInt UndefElts2(VWidth, 0);2690APInt DemandedElts2 =2691APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));2692if (Value *V =2693IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {2694IC.replaceOperand(II, 1, V);2695MadeChange = true;2696}26972698// If either input elements are undef, the result is zero.2699if (DemandedElts1.isSubsetOf(UndefElts1) ||2700DemandedElts2.isSubsetOf(UndefElts2)) {2701return IC.replaceInstUsesWith(II,2702ConstantAggregateZero::get(II.getType()));2703}27042705if (MadeChange) {2706return &II;2707}2708}2709break;2710}27112712case Intrinsic::x86_sse41_insertps:2713if (Value *V = simplifyX86insertps(II, IC.Builder)) {2714return IC.replaceInstUsesWith(II, V);2715}2716break;27172718case Intrinsic::x86_sse4a_extrq: {2719Value *Op0 = II.getArgOperand(0);2720Value *Op1 = II.getArgOperand(1);2721unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();2722unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();2723assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&2724Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&2725VWidth1 == 16 && "Unexpected operand sizes");27262727// See if we're dealing with constant values.2728auto *C1 = dyn_cast<Constant>(Op1);2729auto *CILength =2730C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))2731: nullptr;2732auto *CIIndex =2733C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))2734: nullptr;27352736// Attempt to simplify to a constant, shuffle vector or EXTRQI call.2737if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {2738return IC.replaceInstUsesWith(II, V);2739}27402741// EXTRQ only uses the lowest 64-bits of the first 128-bit vector2742// operands and the lowest 16-bits of the second.2743bool MadeChange = false;2744if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {2745IC.replaceOperand(II, 0, V);2746MadeChange = true;2747}2748if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {2749IC.replaceOperand(II, 1, V);2750MadeChange = true;2751}2752if (MadeChange) {2753return &II;2754}2755break;2756}27572758case Intrinsic::x86_sse4a_extrqi: {2759// EXTRQI: Extract Length bits starting from Index. Zero pad the remaining2760// bits of the lower 64-bits. 
The upper 64-bits are undefined.2761Value *Op0 = II.getArgOperand(0);2762unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();2763assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&2764"Unexpected operand size");27652766// See if we're dealing with constant values.2767auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));2768auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));27692770// Attempt to simplify to a constant or shuffle vector.2771if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {2772return IC.replaceInstUsesWith(II, V);2773}27742775// EXTRQI only uses the lowest 64-bits of the first 128-bit vector2776// operand.2777if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {2778return IC.replaceOperand(II, 0, V);2779}2780break;2781}27822783case Intrinsic::x86_sse4a_insertq: {2784Value *Op0 = II.getArgOperand(0);2785Value *Op1 = II.getArgOperand(1);2786unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();2787assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&2788Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&2789cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&2790"Unexpected operand size");27912792// See if we're dealing with constant values.2793auto *C1 = dyn_cast<Constant>(Op1);2794auto *CI11 =2795C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))2796: nullptr;27972798// Attempt to simplify to a constant, shuffle vector or INSERTQI call.2799if (CI11) {2800const APInt &V11 = CI11->getValue();2801APInt Len = V11.zextOrTrunc(6);2802APInt Idx = V11.lshr(8).zextOrTrunc(6);2803if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {2804return IC.replaceInstUsesWith(II, V);2805}2806}28072808// INSERTQ only uses the lowest 64-bits of the first 128-bit vector2809// operand.2810if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {2811return IC.replaceOperand(II, 0, V);2812}2813break;2814}28152816case Intrinsic::x86_sse4a_insertqi: {2817// INSERTQI: Extract lowest Length bits from lower half of second source and2818// insert over first source starting at Index bit. 
The upper 64-bits are2819// undefined.2820Value *Op0 = II.getArgOperand(0);2821Value *Op1 = II.getArgOperand(1);2822unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();2823unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();2824assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&2825Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&2826VWidth1 == 2 && "Unexpected operand sizes");28272828// See if we're dealing with constant values.2829auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));2830auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));28312832// Attempt to simplify to a constant or shuffle vector.2833if (CILength && CIIndex) {2834APInt Len = CILength->getValue().zextOrTrunc(6);2835APInt Idx = CIIndex->getValue().zextOrTrunc(6);2836if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {2837return IC.replaceInstUsesWith(II, V);2838}2839}28402841// INSERTQI only uses the lowest 64-bits of the first two 128-bit vector2842// operands.2843bool MadeChange = false;2844if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {2845IC.replaceOperand(II, 0, V);2846MadeChange = true;2847}2848if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {2849IC.replaceOperand(II, 1, V);2850MadeChange = true;2851}2852if (MadeChange) {2853return &II;2854}2855break;2856}28572858case Intrinsic::x86_sse41_pblendvb:2859case Intrinsic::x86_sse41_blendvps:2860case Intrinsic::x86_sse41_blendvpd:2861case Intrinsic::x86_avx_blendv_ps_256:2862case Intrinsic::x86_avx_blendv_pd_256:2863case Intrinsic::x86_avx2_pblendvb: {2864// fold (blend A, A, Mask) -> A2865Value *Op0 = II.getArgOperand(0);2866Value *Op1 = II.getArgOperand(1);2867Value *Mask = II.getArgOperand(2);2868if (Op0 == Op1) {2869return IC.replaceInstUsesWith(II, Op0);2870}28712872// Zero Mask - select 1st argument.2873if (isa<ConstantAggregateZero>(Mask)) {2874return IC.replaceInstUsesWith(II, Op0);2875}28762877// Constant Mask - select 1st/2nd argument lane based on top bit of mask.2878if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {2879Constant *NewSelector =2880getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());2881return SelectInst::Create(NewSelector, Op1, Op0, "blendv");2882}28832884Mask = InstCombiner::peekThroughBitcast(Mask);28852886// Peek through a one-use shuffle - VectorCombine should have simplified2887// this for cases where we're splitting wider vectors to use blendv2888// intrinsics.2889Value *MaskSrc = nullptr;2890ArrayRef<int> ShuffleMask;2891if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),2892m_Mask(ShuffleMask))))) {2893// Bail if the shuffle was irregular or contains undefs.2894int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();2895if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||2896any_of(ShuffleMask,2897[NumElts](int M) { return M < 0 || M >= NumElts; }))2898break;2899Mask = InstCombiner::peekThroughBitcast(MaskSrc);2900}29012902// Convert to a vector select if we can bypass casts and find a boolean2903// vector condition value.2904Value *BoolVec;2905if (match(Mask, m_SExt(m_Value(BoolVec))) &&2906BoolVec->getType()->isVectorTy() &&2907BoolVec->getType()->getScalarSizeInBits() == 1) {2908auto *MaskTy = cast<FixedVectorType>(Mask->getType());2909auto *OpTy = cast<FixedVectorType>(II.getType());2910unsigned NumMaskElts = MaskTy->getNumElements();2911unsigned NumOperandElts = OpTy->getNumElements();29122913// If we peeked through a shuffle, reapply 
      // the shuffle to the bool vector.
      if (MaskSrc) {
        unsigned NumMaskSrcElts =
            cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
        NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
        // Multiple mask bits map to the same operand element - bail out.
        if (NumMaskElts > NumOperandElts)
          break;
        SmallVector<int> ScaledMask;
        if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
          break;
        BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
        MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
      }
      assert(MaskTy->getPrimitiveSizeInBits() ==
                 OpTy->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx512_vpermi2var_d_128:
  case Intrinsic::x86_avx512_vpermi2var_d_256:
  case Intrinsic::x86_avx512_vpermi2var_d_512:
  case Intrinsic::x86_avx512_vpermi2var_hi_128:
  case Intrinsic::x86_avx512_vpermi2var_hi_256:
  case Intrinsic::x86_avx512_vpermi2var_hi_512:
  case Intrinsic::x86_avx512_vpermi2var_pd_128:
  case Intrinsic::x86_avx512_vpermi2var_pd_256:
  case Intrinsic::x86_avx512_vpermi2var_pd_512:
  case Intrinsic::x86_avx512_vpermi2var_ps_128:
  case Intrinsic::x86_avx512_vpermi2var_ps_256:
  case Intrinsic::x86_avx512_vpermi2var_ps_512:
  case Intrinsic::x86_avx512_vpermi2var_q_128:
  case Intrinsic::x86_avx512_vpermi2var_q_256:
  case Intrinsic::x86_avx512_vpermi2var_q_512:
  case Intrinsic::x86_avx512_vpermi2var_qi_128:
  case Intrinsic::x86_avx512_vpermi2var_qi_256:
  case Intrinsic::x86_avx512_vpermi2var_qi_512:
    if (Value *V =
simplifyX86vpermv3(II, IC.Builder)) {3005return IC.replaceInstUsesWith(II, V);3006}3007break;30083009case Intrinsic::x86_avx_maskload_ps:3010case Intrinsic::x86_avx_maskload_pd:3011case Intrinsic::x86_avx_maskload_ps_256:3012case Intrinsic::x86_avx_maskload_pd_256:3013case Intrinsic::x86_avx2_maskload_d:3014case Intrinsic::x86_avx2_maskload_q:3015case Intrinsic::x86_avx2_maskload_d_256:3016case Intrinsic::x86_avx2_maskload_q_256:3017if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {3018return I;3019}3020break;30213022case Intrinsic::x86_sse2_maskmov_dqu:3023case Intrinsic::x86_avx_maskstore_ps:3024case Intrinsic::x86_avx_maskstore_pd:3025case Intrinsic::x86_avx_maskstore_ps_256:3026case Intrinsic::x86_avx_maskstore_pd_256:3027case Intrinsic::x86_avx2_maskstore_d:3028case Intrinsic::x86_avx2_maskstore_q:3029case Intrinsic::x86_avx2_maskstore_d_256:3030case Intrinsic::x86_avx2_maskstore_q_256:3031if (simplifyX86MaskedStore(II, IC)) {3032return nullptr;3033}3034break;30353036case Intrinsic::x86_addcarry_32:3037case Intrinsic::x86_addcarry_64:3038if (Value *V = simplifyX86addcarry(II, IC.Builder)) {3039return IC.replaceInstUsesWith(II, V);3040}3041break;30423043case Intrinsic::x86_avx512_pternlog_d_128:3044case Intrinsic::x86_avx512_pternlog_d_256:3045case Intrinsic::x86_avx512_pternlog_d_512:3046case Intrinsic::x86_avx512_pternlog_q_128:3047case Intrinsic::x86_avx512_pternlog_q_256:3048case Intrinsic::x86_avx512_pternlog_q_512:3049if (Value *V = simplifyTernarylogic(II, IC.Builder)) {3050return IC.replaceInstUsesWith(II, V);3051}3052break;3053default:3054break;3055}3056return std::nullopt;3057}30583059std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(3060InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,3061bool &KnownBitsComputed) const {3062switch (II.getIntrinsicID()) {3063default:3064break;3065case Intrinsic::x86_mmx_pmovmskb:3066case Intrinsic::x86_sse_movmsk_ps:3067case Intrinsic::x86_sse2_movmsk_pd:3068case Intrinsic::x86_sse2_pmovmskb_128:3069case Intrinsic::x86_avx_movmsk_ps_256:3070case Intrinsic::x86_avx_movmsk_pd_256:3071case Intrinsic::x86_avx2_pmovmskb: {3072// MOVMSK copies the vector elements' sign bits to the low bits3073// and zeros the high bits.3074unsigned ArgWidth;3075if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {3076ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.3077} else {3078auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());3079ArgWidth = ArgType->getNumElements();3080}30813082// If we don't need any of low bits then return zero,3083// we know that DemandedMask is non-zero already.3084APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);3085Type *VTy = II.getType();3086if (DemandedElts.isZero()) {3087return ConstantInt::getNullValue(VTy);3088}30893090// We know that the upper bits are set to zero.3091Known.Zero.setBitsFrom(ArgWidth);3092KnownBitsComputed = true;3093break;3094}3095}3096return std::nullopt;3097}30983099std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(3100InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,3101APInt &UndefElts2, APInt &UndefElts3,3102std::function<void(Instruction *, unsigned, APInt, APInt &)>3103simplifyAndSetOp) const {3104unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();3105switch (II.getIntrinsicID()) {3106default:3107break;3108case Intrinsic::x86_xop_vfrcz_ss:3109case Intrinsic::x86_xop_vfrcz_sd:3110// The instructions for these intrinsics are speced to zero upper bits not3111// 
pass them through like other scalar intrinsics. So we shouldn't just3112// use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.3113// Instead we should return a zero vector.3114if (!DemandedElts[0]) {3115IC.addToWorklist(&II);3116return ConstantAggregateZero::get(II.getType());3117}31183119// Only the lower element is used.3120DemandedElts = 1;3121simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);31223123// Only the lower element is undefined. The high elements are zero.3124UndefElts = UndefElts[0];3125break;31263127// Unary scalar-as-vector operations that work column-wise.3128case Intrinsic::x86_sse_rcp_ss:3129case Intrinsic::x86_sse_rsqrt_ss:3130simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);31313132// If lowest element of a scalar op isn't used then use Arg0.3133if (!DemandedElts[0]) {3134IC.addToWorklist(&II);3135return II.getArgOperand(0);3136}3137// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions3138// checks).3139break;31403141// Binary scalar-as-vector operations that work column-wise. The high3142// elements come from operand 0. The low element is a function of both3143// operands.3144case Intrinsic::x86_sse_min_ss:3145case Intrinsic::x86_sse_max_ss:3146case Intrinsic::x86_sse_cmp_ss:3147case Intrinsic::x86_sse2_min_sd:3148case Intrinsic::x86_sse2_max_sd:3149case Intrinsic::x86_sse2_cmp_sd: {3150simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);31513152// If lowest element of a scalar op isn't used then use Arg0.3153if (!DemandedElts[0]) {3154IC.addToWorklist(&II);3155return II.getArgOperand(0);3156}31573158// Only lower element is used for operand 1.3159DemandedElts = 1;3160simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);31613162// Lower element is undefined if both lower elements are undefined.3163// Consider things like undef&0. The result is known zero, not undef.3164if (!UndefElts2[0])3165UndefElts.clearBit(0);31663167break;3168}31693170// Binary scalar-as-vector operations that work column-wise. The high3171// elements come from operand 0 and the low element comes from operand 1.3172case Intrinsic::x86_sse41_round_ss:3173case Intrinsic::x86_sse41_round_sd: {3174// Don't use the low element of operand 0.3175APInt DemandedElts2 = DemandedElts;3176DemandedElts2.clearBit(0);3177simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);31783179// If lowest element of a scalar op isn't used then use Arg0.3180if (!DemandedElts[0]) {3181IC.addToWorklist(&II);3182return II.getArgOperand(0);3183}31843185// Only lower element is used for operand 1.3186DemandedElts = 1;3187simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);31883189// Take the high undef elements from operand 0 and take the lower element3190// from operand 1.3191UndefElts.clearBit(0);3192UndefElts |= UndefElts2[0];3193break;3194}31953196// Three input scalar-as-vector operations that work column-wise. 
The high3197// elements come from operand 0 and the low element is a function of all3198// three inputs.3199case Intrinsic::x86_avx512_mask_add_ss_round:3200case Intrinsic::x86_avx512_mask_div_ss_round:3201case Intrinsic::x86_avx512_mask_mul_ss_round:3202case Intrinsic::x86_avx512_mask_sub_ss_round:3203case Intrinsic::x86_avx512_mask_max_ss_round:3204case Intrinsic::x86_avx512_mask_min_ss_round:3205case Intrinsic::x86_avx512_mask_add_sd_round:3206case Intrinsic::x86_avx512_mask_div_sd_round:3207case Intrinsic::x86_avx512_mask_mul_sd_round:3208case Intrinsic::x86_avx512_mask_sub_sd_round:3209case Intrinsic::x86_avx512_mask_max_sd_round:3210case Intrinsic::x86_avx512_mask_min_sd_round:3211simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);32123213// If lowest element of a scalar op isn't used then use Arg0.3214if (!DemandedElts[0]) {3215IC.addToWorklist(&II);3216return II.getArgOperand(0);3217}32183219// Only lower element is used for operand 1 and 2.3220DemandedElts = 1;3221simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3222simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);32233224// Lower element is undefined if all three lower elements are undefined.3225// Consider things like undef&0. The result is known zero, not undef.3226if (!UndefElts2[0] || !UndefElts3[0])3227UndefElts.clearBit(0);3228break;32293230// TODO: Add fmaddsub support?3231case Intrinsic::x86_sse3_addsub_pd:3232case Intrinsic::x86_sse3_addsub_ps:3233case Intrinsic::x86_avx_addsub_pd_256:3234case Intrinsic::x86_avx_addsub_ps_256: {3235// If none of the even or none of the odd lanes are required, turn this3236// into a generic FP math instruction.3237APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));3238APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));3239bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);3240bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);3241if (IsSubOnly || IsAddOnly) {3242assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");3243IRBuilderBase::InsertPointGuard Guard(IC.Builder);3244IC.Builder.SetInsertPoint(&II);3245Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);3246return IC.Builder.CreateBinOp(3247IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1);3248}32493250simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);3251simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3252UndefElts &= UndefElts2;3253break;3254}32553256// General per-element vector operations.3257case Intrinsic::x86_avx2_psllv_d:3258case Intrinsic::x86_avx2_psllv_d_256:3259case Intrinsic::x86_avx2_psllv_q:3260case Intrinsic::x86_avx2_psllv_q_256:3261case Intrinsic::x86_avx2_psrlv_d:3262case Intrinsic::x86_avx2_psrlv_d_256:3263case Intrinsic::x86_avx2_psrlv_q:3264case Intrinsic::x86_avx2_psrlv_q_256:3265case Intrinsic::x86_avx2_psrav_d:3266case Intrinsic::x86_avx2_psrav_d_256: {3267simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);3268simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3269UndefElts &= UndefElts2;3270break;3271}32723273case Intrinsic::x86_sse2_pmulh_w:3274case Intrinsic::x86_avx2_pmulh_w:3275case Intrinsic::x86_avx512_pmulh_w_512:3276case Intrinsic::x86_sse2_pmulhu_w:3277case Intrinsic::x86_avx2_pmulhu_w:3278case Intrinsic::x86_avx512_pmulhu_w_512:3279case Intrinsic::x86_ssse3_pmul_hr_sw_128:3280case Intrinsic::x86_avx2_pmul_hr_sw:3281case Intrinsic::x86_avx512_pmul_hr_sw_512: {3282simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);3283simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);3284// NOTE: mulh(undef,undef) != undef.3285break;3286}32873288case Intrinsic::x86_sse2_packssdw_128:3289case Intrinsic::x86_sse2_packsswb_128:3290case Intrinsic::x86_sse2_packuswb_128:3291case Intrinsic::x86_sse41_packusdw:3292case Intrinsic::x86_avx2_packssdw:3293case Intrinsic::x86_avx2_packsswb:3294case Intrinsic::x86_avx2_packusdw:3295case Intrinsic::x86_avx2_packuswb:3296case Intrinsic::x86_avx512_packssdw_512:3297case Intrinsic::x86_avx512_packsswb_512:3298case Intrinsic::x86_avx512_packusdw_512:3299case Intrinsic::x86_avx512_packuswb_512: {3300auto *Ty0 = II.getArgOperand(0)->getType();3301unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();3302assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");33033304unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;3305unsigned VWidthPerLane = VWidth / NumLanes;3306unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;33073308// Per lane, pack the elements of the first input and then the second.3309// e.g.3310// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])3311// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])3312for (int OpNum = 0; OpNum != 2; ++OpNum) {3313APInt OpDemandedElts(InnerVWidth, 0);3314for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {3315unsigned LaneIdx = Lane * VWidthPerLane;3316for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {3317unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;3318if (DemandedElts[Idx])3319OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);3320}3321}33223323// Demand elements from the operand.3324APInt OpUndefElts(InnerVWidth, 0);3325simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);33263327// Pack the operand's UNDEF elements, one lane at a time.3328OpUndefElts = OpUndefElts.zext(VWidth);3329for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {3330APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);3331LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);3332LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);3333UndefElts |= LaneElts;3334}3335}3336break;3337}33383339case Intrinsic::x86_sse2_pmadd_wd:3340case Intrinsic::x86_avx2_pmadd_wd:3341case Intrinsic::x86_avx512_pmaddw_d_512:3342case Intrinsic::x86_ssse3_pmadd_ub_sw_128:3343case 
Intrinsic::x86_avx2_pmadd_ub_sw:
  case Intrinsic::x86_avx512_pmaddubs_w_512: {
    // PMADD - demand both src elements that map to each dst element.
    auto *ArgTy = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
    assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
    APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
    APInt Op0UndefElts(InnerVWidth, 0);
    APInt Op1UndefElts(InnerVWidth, 0);
    simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
    simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
    // NOTE: madd(undef,undef) != undef.
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}
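
// Illustrative worked example (comments only, not used by the pass) of the
// BMI PEXT/PDEP constant folding performed in instCombineIntrinsic above.
// The constants below are arbitrary and chosen only to trace the loops:
//
//   Mask = 0b0110'1010  (set bits at positions 1, 3, 5 and 6)
//
//   PDEP deposits the low bits of the source into the set positions of the
//   mask, scanning from the least significant mask bit upwards:
//     Src    = 0b1011
//     bit 0 of Src -> result bit 1
//     bit 1 of Src -> result bit 3
//     bit 2 of Src -> result bit 5
//     bit 3 of Src -> result bit 6
//     Result = 0b0100'1010
//
//   PEXT is the inverse: it gathers the source bits found at the set
//   positions of the mask and packs them into the low bits of the result:
//     Src    = 0b0100'1010
//     Result = 0b1011
//
// When the mask is a single contiguous run of ones (isShiftedMask), the fold
// emits plain IR instead of walking the bits, e.g. for Mask = 0xFF00
// (MaskIdx = 8):
//   pext(x, 0xFF00) == (x & 0xFF00) >> 8
//   pdep(x, 0xFF00) == (x << 8) & 0xFF00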