Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
35266 views
//===- InstCombineVectorOps.cpp -------------------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements instcombine for ExtractElement, InsertElement and9// ShuffleVector.10//11//===----------------------------------------------------------------------===//1213#include "InstCombineInternal.h"14#include "llvm/ADT/APInt.h"15#include "llvm/ADT/ArrayRef.h"16#include "llvm/ADT/DenseMap.h"17#include "llvm/ADT/STLExtras.h"18#include "llvm/ADT/SmallBitVector.h"19#include "llvm/ADT/SmallVector.h"20#include "llvm/ADT/Statistic.h"21#include "llvm/Analysis/InstructionSimplify.h"22#include "llvm/Analysis/VectorUtils.h"23#include "llvm/IR/BasicBlock.h"24#include "llvm/IR/Constant.h"25#include "llvm/IR/Constants.h"26#include "llvm/IR/DerivedTypes.h"27#include "llvm/IR/InstrTypes.h"28#include "llvm/IR/Instruction.h"29#include "llvm/IR/Instructions.h"30#include "llvm/IR/Operator.h"31#include "llvm/IR/PatternMatch.h"32#include "llvm/IR/Type.h"33#include "llvm/IR/User.h"34#include "llvm/IR/Value.h"35#include "llvm/Support/Casting.h"36#include "llvm/Support/ErrorHandling.h"37#include "llvm/Transforms/InstCombine/InstCombiner.h"38#include <cassert>39#include <cstdint>40#include <iterator>41#include <utility>4243#define DEBUG_TYPE "instcombine"4445using namespace llvm;46using namespace PatternMatch;4748STATISTIC(NumAggregateReconstructionsSimplified,49"Number of aggregate reconstructions turned into reuse of the "50"original aggregate");5152/// Return true if the value is cheaper to scalarize than it is to leave as a53/// vector operation. If the extract index \p EI is a constant integer then54/// some operations may be cheap to scalarize.55///56/// FIXME: It's possible to create more instructions than previously existed.57static bool cheapToScalarize(Value *V, Value *EI) {58ConstantInt *CEI = dyn_cast<ConstantInt>(EI);5960// If we can pick a scalar constant value out of a vector, that is free.61if (auto *C = dyn_cast<Constant>(V))62return CEI || C->getSplatValue();6364if (CEI && match(V, m_Intrinsic<Intrinsic::experimental_stepvector>())) {65ElementCount EC = cast<VectorType>(V->getType())->getElementCount();66// Index needs to be lower than the minimum size of the vector, because67// for scalable vector, the vector size is known at run time.68return CEI->getValue().ult(EC.getKnownMinValue());69}7071// An insertelement to the same constant index as our extract will simplify72// to the scalar inserted element. An insertelement to a different constant73// index is irrelevant to our extract.74if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))75return CEI;7677if (match(V, m_OneUse(m_Load(m_Value()))))78return true;7980if (match(V, m_OneUse(m_UnOp())))81return true;8283Value *V0, *V1;84if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))85if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI))86return true;8788CmpInst::Predicate UnusedPred;89if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))90if (cheapToScalarize(V0, EI) || cheapToScalarize(V1, EI))91return true;9293return false;94}9596// If we have a PHI node with a vector type that is only used to feed97// itself and be an operand of extractelement at a constant location,98// try to replace the PHI of the vector type with a PHI of a scalar type.99Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,100PHINode *PN) {101SmallVector<Instruction *, 2> Extracts;102// The users we want the PHI to have are:103// 1) The EI ExtractElement (we already know this)104// 2) Possibly more ExtractElements with the same index.105// 3) Another operand, which will feed back into the PHI.106Instruction *PHIUser = nullptr;107for (auto *U : PN->users()) {108if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {109if (EI.getIndexOperand() == EU->getIndexOperand())110Extracts.push_back(EU);111else112return nullptr;113} else if (!PHIUser) {114PHIUser = cast<Instruction>(U);115} else {116return nullptr;117}118}119120if (!PHIUser)121return nullptr;122123// Verify that this PHI user has one use, which is the PHI itself,124// and that it is a binary operation which is cheap to scalarize.125// otherwise return nullptr.126if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||127!(isa<BinaryOperator>(PHIUser)) ||128!cheapToScalarize(PHIUser, EI.getIndexOperand()))129return nullptr;130131// Create a scalar PHI node that will replace the vector PHI node132// just before the current PHI node.133PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(134PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), PN->getIterator()));135// Scalarize each PHI operand.136for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {137Value *PHIInVal = PN->getIncomingValue(i);138BasicBlock *inBB = PN->getIncomingBlock(i);139Value *Elt = EI.getIndexOperand();140// If the operand is the PHI induction variable:141if (PHIInVal == PHIUser) {142// Scalarize the binary operation. Its first operand is the143// scalar PHI, and the second operand is extracted from the other144// vector operand.145BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);146unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;147Value *Op = InsertNewInstWith(148ExtractElementInst::Create(B0->getOperand(opId), Elt,149B0->getOperand(opId)->getName() + ".Elt"),150B0->getIterator());151Value *newPHIUser = InsertNewInstWith(152BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),153scalarPHI, Op, B0), B0->getIterator());154scalarPHI->addIncoming(newPHIUser, inBB);155} else {156// Scalarize PHI input:157Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");158// Insert the new instruction into the predecessor basic block.159Instruction *pos = dyn_cast<Instruction>(PHIInVal);160BasicBlock::iterator InsertPos;161if (pos && !isa<PHINode>(pos)) {162InsertPos = ++pos->getIterator();163} else {164InsertPos = inBB->getFirstInsertionPt();165}166167InsertNewInstWith(newEI, InsertPos);168169scalarPHI->addIncoming(newEI, inBB);170}171}172173for (auto *E : Extracts) {174replaceInstUsesWith(*E, scalarPHI);175// Add old extract to worklist for DCE.176addToWorklist(E);177}178179return &EI;180}181182Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {183Value *X;184uint64_t ExtIndexC;185if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||186!match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))187return nullptr;188189ElementCount NumElts =190cast<VectorType>(Ext.getVectorOperandType())->getElementCount();191Type *DestTy = Ext.getType();192unsigned DestWidth = DestTy->getPrimitiveSizeInBits();193bool IsBigEndian = DL.isBigEndian();194195// If we are casting an integer to vector and extracting a portion, that is196// a shift-right and truncate.197if (X->getType()->isIntegerTy()) {198assert(isa<FixedVectorType>(Ext.getVectorOperand()->getType()) &&199"Expected fixed vector type for bitcast from scalar integer");200201// Big endian requires adjusting the extract index since MSB is at index 0.202// LittleEndian: extelt (bitcast i32 X to v4i8), 0 -> trunc i32 X to i8203// BigEndian: extelt (bitcast i32 X to v4i8), 0 -> trunc i32 (X >> 24) to i8204if (IsBigEndian)205ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC;206unsigned ShiftAmountC = ExtIndexC * DestWidth;207if (!ShiftAmountC ||208(isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) &&209Ext.getVectorOperand()->hasOneUse())) {210if (ShiftAmountC)211X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset");212if (DestTy->isFloatingPointTy()) {213Type *DstIntTy = IntegerType::getIntNTy(X->getContext(), DestWidth);214Value *Trunc = Builder.CreateTrunc(X, DstIntTy);215return new BitCastInst(Trunc, DestTy);216}217return new TruncInst(X, DestTy);218}219}220221if (!X->getType()->isVectorTy())222return nullptr;223224// If this extractelement is using a bitcast from a vector of the same number225// of elements, see if we can find the source element from the source vector:226// extelt (bitcast VecX), IndexC --> bitcast X[IndexC]227auto *SrcTy = cast<VectorType>(X->getType());228ElementCount NumSrcElts = SrcTy->getElementCount();229if (NumSrcElts == NumElts)230if (Value *Elt = findScalarElement(X, ExtIndexC))231return new BitCastInst(Elt, DestTy);232233assert(NumSrcElts.isScalable() == NumElts.isScalable() &&234"Src and Dst must be the same sort of vector type");235236// If the source elements are wider than the destination, try to shift and237// truncate a subset of scalar bits of an insert op.238if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) {239Value *Scalar;240Value *Vec;241uint64_t InsIndexC;242if (!match(X, m_InsertElt(m_Value(Vec), m_Value(Scalar),243m_ConstantInt(InsIndexC))))244return nullptr;245246// The extract must be from the subset of vector elements that we inserted247// into. Example: if we inserted element 1 of a <2 x i64> and we are248// extracting an i16 (narrowing ratio = 4), then this extract must be from 1249// of elements 4-7 of the bitcasted vector.250unsigned NarrowingRatio =251NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue();252253if (ExtIndexC / NarrowingRatio != InsIndexC) {254// Remove insertelement, if we don't use the inserted element.255// extractelement (bitcast (insertelement (Vec, b)), a) ->256// extractelement (bitcast (Vec), a)257// FIXME: this should be removed to SimplifyDemandedVectorElts,258// once scale vectors are supported.259if (X->hasOneUse() && Ext.getVectorOperand()->hasOneUse()) {260Value *NewBC = Builder.CreateBitCast(Vec, Ext.getVectorOperandType());261return ExtractElementInst::Create(NewBC, Ext.getIndexOperand());262}263return nullptr;264}265266// We are extracting part of the original scalar. How that scalar is267// inserted into the vector depends on the endian-ness. Example:268// Vector Byte Elt Index: 0 1 2 3 4 5 6 7269// +--+--+--+--+--+--+--+--+270// inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3|271// extelt <4 x i16> V', 3: | |S2|S3|272// +--+--+--+--+--+--+--+--+273// If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.274// If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.275// In this example, we must right-shift little-endian. Big-endian is just a276// truncate.277unsigned Chunk = ExtIndexC % NarrowingRatio;278if (IsBigEndian)279Chunk = NarrowingRatio - 1 - Chunk;280281// Bail out if this is an FP vector to FP vector sequence. That would take282// more instructions than we started with unless there is no shift, and it283// may not be handled as well in the backend.284bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();285bool NeedDestBitcast = DestTy->isFloatingPointTy();286if (NeedSrcBitcast && NeedDestBitcast)287return nullptr;288289unsigned SrcWidth = SrcTy->getScalarSizeInBits();290unsigned ShAmt = Chunk * DestWidth;291292// TODO: This limitation is more strict than necessary. We could sum the293// number of new instructions and subtract the number eliminated to know if294// we can proceed.295if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())296if (NeedSrcBitcast || NeedDestBitcast)297return nullptr;298299if (NeedSrcBitcast) {300Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);301Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);302}303304if (ShAmt) {305// Bail out if we could end with more instructions than we started with.306if (!Ext.getVectorOperand()->hasOneUse())307return nullptr;308Scalar = Builder.CreateLShr(Scalar, ShAmt);309}310311if (NeedDestBitcast) {312Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);313return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);314}315return new TruncInst(Scalar, DestTy);316}317318return nullptr;319}320321/// Find elements of V demanded by UserInstr.322static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {323unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();324325// Conservatively assume that all elements are needed.326APInt UsedElts(APInt::getAllOnes(VWidth));327328switch (UserInstr->getOpcode()) {329case Instruction::ExtractElement: {330ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);331assert(EEI->getVectorOperand() == V);332ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());333if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {334UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());335}336break;337}338case Instruction::ShuffleVector: {339ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);340unsigned MaskNumElts =341cast<FixedVectorType>(UserInstr->getType())->getNumElements();342343UsedElts = APInt(VWidth, 0);344for (unsigned i = 0; i < MaskNumElts; i++) {345unsigned MaskVal = Shuffle->getMaskValue(i);346if (MaskVal == -1u || MaskVal >= 2 * VWidth)347continue;348if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))349UsedElts.setBit(MaskVal);350if (Shuffle->getOperand(1) == V &&351((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))352UsedElts.setBit(MaskVal - VWidth);353}354break;355}356default:357break;358}359return UsedElts;360}361362/// Find union of elements of V demanded by all its users.363/// If it is known by querying findDemandedEltsBySingleUser that364/// no user demands an element of V, then the corresponding bit365/// remains unset in the returned value.366static APInt findDemandedEltsByAllUsers(Value *V) {367unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();368369APInt UnionUsedElts(VWidth, 0);370for (const Use &U : V->uses()) {371if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {372UnionUsedElts |= findDemandedEltsBySingleUser(V, I);373} else {374UnionUsedElts = APInt::getAllOnes(VWidth);375break;376}377378if (UnionUsedElts.isAllOnes())379break;380}381382return UnionUsedElts;383}384385/// Given a constant index for a extractelement or insertelement instruction,386/// return it with the canonical type if it isn't already canonical. We387/// arbitrarily pick 64 bit as our canonical type. The actual bitwidth doesn't388/// matter, we just want a consistent type to simplify CSE.389static ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) {390const unsigned IndexBW = IndexC->getBitWidth();391if (IndexBW == 64 || IndexC->getValue().getActiveBits() > 64)392return nullptr;393return ConstantInt::get(IndexC->getContext(),394IndexC->getValue().zextOrTrunc(64));395}396397Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {398Value *SrcVec = EI.getVectorOperand();399Value *Index = EI.getIndexOperand();400if (Value *V = simplifyExtractElementInst(SrcVec, Index,401SQ.getWithInstruction(&EI)))402return replaceInstUsesWith(EI, V);403404// extractelt (select %x, %vec1, %vec2), %const ->405// select %x, %vec1[%const], %vec2[%const]406// TODO: Support constant folding of multiple select operands:407// extractelt (select %x, %vec1, %vec2), (select %x, %c1, %c2)408// If the extractelement will for instance try to do out of bounds accesses409// because of the values of %c1 and/or %c2, the sequence could be optimized410// early. This is currently not possible because constant folding will reach411// an unreachable assertion if it doesn't find a constant operand.412if (SelectInst *SI = dyn_cast<SelectInst>(EI.getVectorOperand()))413if (SI->getCondition()->getType()->isIntegerTy() &&414isa<Constant>(EI.getIndexOperand()))415if (Instruction *R = FoldOpIntoSelect(EI, SI))416return R;417418// If extracting a specified index from the vector, see if we can recursively419// find a previously computed scalar that was inserted into the vector.420auto *IndexC = dyn_cast<ConstantInt>(Index);421bool HasKnownValidIndex = false;422if (IndexC) {423// Canonicalize type of constant indices to i64 to simplify CSE424if (auto *NewIdx = getPreferredVectorIndex(IndexC))425return replaceOperand(EI, 1, NewIdx);426427ElementCount EC = EI.getVectorOperandType()->getElementCount();428unsigned NumElts = EC.getKnownMinValue();429HasKnownValidIndex = IndexC->getValue().ult(NumElts);430431if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(SrcVec)) {432Intrinsic::ID IID = II->getIntrinsicID();433// Index needs to be lower than the minimum size of the vector, because434// for scalable vector, the vector size is known at run time.435if (IID == Intrinsic::experimental_stepvector &&436IndexC->getValue().ult(NumElts)) {437Type *Ty = EI.getType();438unsigned BitWidth = Ty->getIntegerBitWidth();439Value *Idx;440// Return index when its value does not exceed the allowed limit441// for the element type of the vector, otherwise return undefined.442if (IndexC->getValue().getActiveBits() <= BitWidth)443Idx = ConstantInt::get(Ty, IndexC->getValue().zextOrTrunc(BitWidth));444else445Idx = PoisonValue::get(Ty);446return replaceInstUsesWith(EI, Idx);447}448}449450// InstSimplify should handle cases where the index is invalid.451// For fixed-length vector, it's invalid to extract out-of-range element.452if (!EC.isScalable() && IndexC->getValue().uge(NumElts))453return nullptr;454455if (Instruction *I = foldBitcastExtElt(EI))456return I;457458// If there's a vector PHI feeding a scalar use through this extractelement459// instruction, try to scalarize the PHI.460if (auto *Phi = dyn_cast<PHINode>(SrcVec))461if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))462return ScalarPHI;463}464465// TODO come up with a n-ary matcher that subsumes both unary and466// binary matchers.467UnaryOperator *UO;468if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, Index)) {469// extelt (unop X), Index --> unop (extelt X, Index)470Value *X = UO->getOperand(0);471Value *E = Builder.CreateExtractElement(X, Index);472return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);473}474475// If the binop is not speculatable, we cannot hoist the extractelement if476// it may make the operand poison.477BinaryOperator *BO;478if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, Index) &&479(HasKnownValidIndex || isSafeToSpeculativelyExecute(BO))) {480// extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)481Value *X = BO->getOperand(0), *Y = BO->getOperand(1);482Value *E0 = Builder.CreateExtractElement(X, Index);483Value *E1 = Builder.CreateExtractElement(Y, Index);484return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);485}486487Value *X, *Y;488CmpInst::Predicate Pred;489if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&490cheapToScalarize(SrcVec, Index)) {491// extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)492Value *E0 = Builder.CreateExtractElement(X, Index);493Value *E1 = Builder.CreateExtractElement(Y, Index);494CmpInst *SrcCmpInst = cast<CmpInst>(SrcVec);495return CmpInst::CreateWithCopiedFlags(SrcCmpInst->getOpcode(), Pred, E0, E1,496SrcCmpInst);497}498499if (auto *I = dyn_cast<Instruction>(SrcVec)) {500if (auto *IE = dyn_cast<InsertElementInst>(I)) {501// instsimplify already handled the case where the indices are constants502// and equal by value, if both are constants, they must not be the same503// value, extract from the pre-inserted value instead.504if (isa<Constant>(IE->getOperand(2)) && IndexC)505return replaceOperand(EI, 0, IE->getOperand(0));506} else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {507auto *VecType = cast<VectorType>(GEP->getType());508ElementCount EC = VecType->getElementCount();509uint64_t IdxVal = IndexC ? IndexC->getZExtValue() : 0;510if (IndexC && IdxVal < EC.getKnownMinValue() && GEP->hasOneUse()) {511// Find out why we have a vector result - these are a few examples:512// 1. We have a scalar pointer and a vector of indices, or513// 2. We have a vector of pointers and a scalar index, or514// 3. We have a vector of pointers and a vector of indices, etc.515// Here we only consider combining when there is exactly one vector516// operand, since the optimization is less obviously a win due to517// needing more than one extractelements.518519unsigned VectorOps =520llvm::count_if(GEP->operands(), [](const Value *V) {521return isa<VectorType>(V->getType());522});523if (VectorOps == 1) {524Value *NewPtr = GEP->getPointerOperand();525if (isa<VectorType>(NewPtr->getType()))526NewPtr = Builder.CreateExtractElement(NewPtr, IndexC);527528SmallVector<Value *> NewOps;529for (unsigned I = 1; I != GEP->getNumOperands(); ++I) {530Value *Op = GEP->getOperand(I);531if (isa<VectorType>(Op->getType()))532NewOps.push_back(Builder.CreateExtractElement(Op, IndexC));533else534NewOps.push_back(Op);535}536537GetElementPtrInst *NewGEP = GetElementPtrInst::Create(538GEP->getSourceElementType(), NewPtr, NewOps);539NewGEP->setIsInBounds(GEP->isInBounds());540return NewGEP;541}542}543} else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {544// If this is extracting an element from a shufflevector, figure out where545// it came from and extract from the appropriate input element instead.546// Restrict the following transformation to fixed-length vector.547if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {548int SrcIdx =549SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());550Value *Src;551unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())552->getNumElements();553554if (SrcIdx < 0)555return replaceInstUsesWith(EI, PoisonValue::get(EI.getType()));556if (SrcIdx < (int)LHSWidth)557Src = SVI->getOperand(0);558else {559SrcIdx -= LHSWidth;560Src = SVI->getOperand(1);561}562Type *Int64Ty = Type::getInt64Ty(EI.getContext());563return ExtractElementInst::Create(564Src, ConstantInt::get(Int64Ty, SrcIdx, false));565}566} else if (auto *CI = dyn_cast<CastInst>(I)) {567// Canonicalize extractelement(cast) -> cast(extractelement).568// Bitcasts can change the number of vector elements, and they cost569// nothing.570if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {571Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);572return CastInst::Create(CI->getOpcode(), EE, EI.getType());573}574}575}576577// Run demanded elements after other transforms as this can drop flags on578// binops. If there's two paths to the same final result, we prefer the579// one which doesn't force us to drop flags.580if (IndexC) {581ElementCount EC = EI.getVectorOperandType()->getElementCount();582unsigned NumElts = EC.getKnownMinValue();583// This instruction only demands the single element from the input vector.584// Skip for scalable type, the number of elements is unknown at585// compile-time.586if (!EC.isScalable() && NumElts != 1) {587// If the input vector has a single use, simplify it based on this use588// property.589if (SrcVec->hasOneUse()) {590APInt PoisonElts(NumElts, 0);591APInt DemandedElts(NumElts, 0);592DemandedElts.setBit(IndexC->getZExtValue());593if (Value *V =594SimplifyDemandedVectorElts(SrcVec, DemandedElts, PoisonElts))595return replaceOperand(EI, 0, V);596} else {597// If the input vector has multiple uses, simplify it based on a union598// of all elements used.599APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);600if (!DemandedElts.isAllOnes()) {601APInt PoisonElts(NumElts, 0);602if (Value *V = SimplifyDemandedVectorElts(603SrcVec, DemandedElts, PoisonElts, 0 /* Depth */,604true /* AllowMultipleUsers */)) {605if (V != SrcVec) {606Worklist.addValue(SrcVec);607SrcVec->replaceAllUsesWith(V);608return &EI;609}610}611}612}613}614}615return nullptr;616}617618/// If V is a shuffle of values that ONLY returns elements from either LHS or619/// RHS, return the shuffle mask and true. Otherwise, return false.620static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,621SmallVectorImpl<int> &Mask) {622assert(LHS->getType() == RHS->getType() &&623"Invalid CollectSingleShuffleElements");624unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();625626if (match(V, m_Poison())) {627Mask.assign(NumElts, -1);628return true;629}630631if (V == LHS) {632for (unsigned i = 0; i != NumElts; ++i)633Mask.push_back(i);634return true;635}636637if (V == RHS) {638for (unsigned i = 0; i != NumElts; ++i)639Mask.push_back(i + NumElts);640return true;641}642643if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {644// If this is an insert of an extract from some other vector, include it.645Value *VecOp = IEI->getOperand(0);646Value *ScalarOp = IEI->getOperand(1);647Value *IdxOp = IEI->getOperand(2);648649if (!isa<ConstantInt>(IdxOp))650return false;651unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();652653if (isa<PoisonValue>(ScalarOp)) { // inserting poison into vector.654// We can handle this if the vector we are inserting into is655// transitively ok.656if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {657// If so, update the mask to reflect the inserted poison.658Mask[InsertedIdx] = -1;659return true;660}661} else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){662if (isa<ConstantInt>(EI->getOperand(1))) {663unsigned ExtractedIdx =664cast<ConstantInt>(EI->getOperand(1))->getZExtValue();665unsigned NumLHSElts =666cast<FixedVectorType>(LHS->getType())->getNumElements();667668// This must be extracting from either LHS or RHS.669if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {670// We can handle this if the vector we are inserting into is671// transitively ok.672if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {673// If so, update the mask to reflect the inserted value.674if (EI->getOperand(0) == LHS) {675Mask[InsertedIdx % NumElts] = ExtractedIdx;676} else {677assert(EI->getOperand(0) == RHS);678Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;679}680return true;681}682}683}684}685}686687return false;688}689690/// If we have insertion into a vector that is wider than the vector that we691/// are extracting from, try to widen the source vector to allow a single692/// shufflevector to replace one or more insert/extract pairs.693static bool replaceExtractElements(InsertElementInst *InsElt,694ExtractElementInst *ExtElt,695InstCombinerImpl &IC) {696auto *InsVecType = cast<FixedVectorType>(InsElt->getType());697auto *ExtVecType = cast<FixedVectorType>(ExtElt->getVectorOperandType());698unsigned NumInsElts = InsVecType->getNumElements();699unsigned NumExtElts = ExtVecType->getNumElements();700701// The inserted-to vector must be wider than the extracted-from vector.702if (InsVecType->getElementType() != ExtVecType->getElementType() ||703NumExtElts >= NumInsElts)704return false;705706// Create a shuffle mask to widen the extended-from vector using poison707// values. The mask selects all of the values of the original vector followed708// by as many poison values as needed to create a vector of the same length709// as the inserted-to vector.710SmallVector<int, 16> ExtendMask;711for (unsigned i = 0; i < NumExtElts; ++i)712ExtendMask.push_back(i);713for (unsigned i = NumExtElts; i < NumInsElts; ++i)714ExtendMask.push_back(-1);715716Value *ExtVecOp = ExtElt->getVectorOperand();717auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);718BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))719? ExtVecOpInst->getParent()720: ExtElt->getParent();721722// TODO: This restriction matches the basic block check below when creating723// new extractelement instructions. If that limitation is removed, this one724// could also be removed. But for now, we just bail out to ensure that we725// will replace the extractelement instruction that is feeding our726// insertelement instruction. This allows the insertelement to then be727// replaced by a shufflevector. If the insertelement is not replaced, we can728// induce infinite looping because there's an optimization for extractelement729// that will delete our widening shuffle. This would trigger another attempt730// here to create that shuffle, and we spin forever.731if (InsertionBlock != InsElt->getParent())732return false;733734// TODO: This restriction matches the check in visitInsertElementInst() and735// prevents an infinite loop caused by not turning the extract/insert pair736// into a shuffle. We really should not need either check, but we're lacking737// folds for shufflevectors because we're afraid to generate shuffle masks738// that the backend can't handle.739if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))740return false;741742auto *WideVec = new ShuffleVectorInst(ExtVecOp, ExtendMask);743744// Insert the new shuffle after the vector operand of the extract is defined745// (as long as it's not a PHI) or at the start of the basic block of the746// extract, so any subsequent extracts in the same basic block can use it.747// TODO: Insert before the earliest ExtractElementInst that is replaced.748if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))749WideVec->insertAfter(ExtVecOpInst);750else751IC.InsertNewInstWith(WideVec, ExtElt->getParent()->getFirstInsertionPt());752753// Replace extracts from the original narrow vector with extracts from the new754// wide vector.755for (User *U : ExtVecOp->users()) {756ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);757if (!OldExt || OldExt->getParent() != WideVec->getParent())758continue;759auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));760IC.InsertNewInstWith(NewExt, OldExt->getIterator());761IC.replaceInstUsesWith(*OldExt, NewExt);762// Add the old extracts to the worklist for DCE. We can't remove the763// extracts directly, because they may still be used by the calling code.764IC.addToWorklist(OldExt);765}766767return true;768}769770/// We are building a shuffle to create V, which is a sequence of insertelement,771/// extractelement pairs. If PermittedRHS is set, then we must either use it or772/// not rely on the second vector source. Return a std::pair containing the773/// left and right vectors of the proposed shuffle (or 0), and set the Mask774/// parameter as required.775///776/// Note: we intentionally don't try to fold earlier shuffles since they have777/// often been chosen carefully to be efficiently implementable on the target.778using ShuffleOps = std::pair<Value *, Value *>;779780static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,781Value *PermittedRHS,782InstCombinerImpl &IC, bool &Rerun) {783assert(V->getType()->isVectorTy() && "Invalid shuffle!");784unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();785786if (match(V, m_Poison())) {787Mask.assign(NumElts, -1);788return std::make_pair(789PermittedRHS ? PoisonValue::get(PermittedRHS->getType()) : V, nullptr);790}791792if (isa<ConstantAggregateZero>(V)) {793Mask.assign(NumElts, 0);794return std::make_pair(V, nullptr);795}796797if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {798// If this is an insert of an extract from some other vector, include it.799Value *VecOp = IEI->getOperand(0);800Value *ScalarOp = IEI->getOperand(1);801Value *IdxOp = IEI->getOperand(2);802803if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {804if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {805unsigned ExtractedIdx =806cast<ConstantInt>(EI->getOperand(1))->getZExtValue();807unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();808809// Either the extracted from or inserted into vector must be RHSVec,810// otherwise we'd end up with a shuffle of three inputs.811if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {812Value *RHS = EI->getOperand(0);813ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC, Rerun);814assert(LR.second == nullptr || LR.second == RHS);815816if (LR.first->getType() != RHS->getType()) {817// Although we are giving up for now, see if we can create extracts818// that match the inserts for another round of combining.819if (replaceExtractElements(IEI, EI, IC))820Rerun = true;821822// We tried our best, but we can't find anything compatible with RHS823// further up the chain. Return a trivial shuffle.824for (unsigned i = 0; i < NumElts; ++i)825Mask[i] = i;826return std::make_pair(V, nullptr);827}828829unsigned NumLHSElts =830cast<FixedVectorType>(RHS->getType())->getNumElements();831Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;832return std::make_pair(LR.first, RHS);833}834835if (VecOp == PermittedRHS) {836// We've gone as far as we can: anything on the other side of the837// extractelement will already have been converted into a shuffle.838unsigned NumLHSElts =839cast<FixedVectorType>(EI->getOperand(0)->getType())840->getNumElements();841for (unsigned i = 0; i != NumElts; ++i)842Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);843return std::make_pair(EI->getOperand(0), PermittedRHS);844}845846// If this insertelement is a chain that comes from exactly these two847// vectors, return the vector and the effective shuffle.848if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&849collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,850Mask))851return std::make_pair(EI->getOperand(0), PermittedRHS);852}853}854}855856// Otherwise, we can't do anything fancy. Return an identity vector.857for (unsigned i = 0; i != NumElts; ++i)858Mask.push_back(i);859return std::make_pair(V, nullptr);860}861862/// Look for chain of insertvalue's that fully define an aggregate, and trace863/// back the values inserted, see if they are all were extractvalue'd from864/// the same source aggregate from the exact same element indexes.865/// If they were, just reuse the source aggregate.866/// This potentially deals with PHI indirections.867Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(868InsertValueInst &OrigIVI) {869Type *AggTy = OrigIVI.getType();870unsigned NumAggElts;871switch (AggTy->getTypeID()) {872case Type::StructTyID:873NumAggElts = AggTy->getStructNumElements();874break;875case Type::ArrayTyID:876NumAggElts = AggTy->getArrayNumElements();877break;878default:879llvm_unreachable("Unhandled aggregate type?");880}881882// Arbitrary aggregate size cut-off. Motivation for limit of 2 is to be able883// to handle clang C++ exception struct (which is hardcoded as {i8*, i32}),884// FIXME: any interesting patterns to be caught with larger limit?885assert(NumAggElts > 0 && "Aggregate should have elements.");886if (NumAggElts > 2)887return nullptr;888889static constexpr auto NotFound = std::nullopt;890static constexpr auto FoundMismatch = nullptr;891892// Try to find a value of each element of an aggregate.893// FIXME: deal with more complex, not one-dimensional, aggregate types894SmallVector<std::optional<Instruction *>, 2> AggElts(NumAggElts, NotFound);895896// Do we know values for each element of the aggregate?897auto KnowAllElts = [&AggElts]() {898return !llvm::is_contained(AggElts, NotFound);899};900901int Depth = 0;902903// Arbitrary `insertvalue` visitation depth limit. Let's be okay with904// every element being overwritten twice, which should never happen.905static const int DepthLimit = 2 * NumAggElts;906907// Recurse up the chain of `insertvalue` aggregate operands until either we've908// reconstructed full initializer or can't visit any more `insertvalue`'s.909for (InsertValueInst *CurrIVI = &OrigIVI;910Depth < DepthLimit && CurrIVI && !KnowAllElts();911CurrIVI = dyn_cast<InsertValueInst>(CurrIVI->getAggregateOperand()),912++Depth) {913auto *InsertedValue =914dyn_cast<Instruction>(CurrIVI->getInsertedValueOperand());915if (!InsertedValue)916return nullptr; // Inserted value must be produced by an instruction.917918ArrayRef<unsigned int> Indices = CurrIVI->getIndices();919920// Don't bother with more than single-level aggregates.921if (Indices.size() != 1)922return nullptr; // FIXME: deal with more complex aggregates?923924// Now, we may have already previously recorded the value for this element925// of an aggregate. If we did, that means the CurrIVI will later be926// overwritten with the already-recorded value. But if not, let's record it!927std::optional<Instruction *> &Elt = AggElts[Indices.front()];928Elt = Elt.value_or(InsertedValue);929930// FIXME: should we handle chain-terminating undef base operand?931}932933// Was that sufficient to deduce the full initializer for the aggregate?934if (!KnowAllElts())935return nullptr; // Give up then.936937// We now want to find the source[s] of the aggregate elements we've found.938// And with "source" we mean the original aggregate[s] from which939// the inserted elements were extracted. This may require PHI translation.940941enum class AggregateDescription {942/// When analyzing the value that was inserted into an aggregate, we did943/// not manage to find defining `extractvalue` instruction to analyze.944NotFound,945/// When analyzing the value that was inserted into an aggregate, we did946/// manage to find defining `extractvalue` instruction[s], and everything947/// matched perfectly - aggregate type, element insertion/extraction index.948Found,949/// When analyzing the value that was inserted into an aggregate, we did950/// manage to find defining `extractvalue` instruction, but there was951/// a mismatch: either the source type from which the extraction was didn't952/// match the aggregate type into which the insertion was,953/// or the extraction/insertion channels mismatched,954/// or different elements had different source aggregates.955FoundMismatch956};957auto Describe = [](std::optional<Value *> SourceAggregate) {958if (SourceAggregate == NotFound)959return AggregateDescription::NotFound;960if (*SourceAggregate == FoundMismatch)961return AggregateDescription::FoundMismatch;962return AggregateDescription::Found;963};964965// Given the value \p Elt that was being inserted into element \p EltIdx of an966// aggregate AggTy, see if \p Elt was originally defined by an967// appropriate extractvalue (same element index, same aggregate type).968// If found, return the source aggregate from which the extraction was.969// If \p PredBB is provided, does PHI translation of an \p Elt first.970auto FindSourceAggregate =971[&](Instruction *Elt, unsigned EltIdx, std::optional<BasicBlock *> UseBB,972std::optional<BasicBlock *> PredBB) -> std::optional<Value *> {973// For now(?), only deal with, at most, a single level of PHI indirection.974if (UseBB && PredBB)975Elt = dyn_cast<Instruction>(Elt->DoPHITranslation(*UseBB, *PredBB));976// FIXME: deal with multiple levels of PHI indirection?977978// Did we find an extraction?979auto *EVI = dyn_cast_or_null<ExtractValueInst>(Elt);980if (!EVI)981return NotFound;982983Value *SourceAggregate = EVI->getAggregateOperand();984985// Is the extraction from the same type into which the insertion was?986if (SourceAggregate->getType() != AggTy)987return FoundMismatch;988// And the element index doesn't change between extraction and insertion?989if (EVI->getNumIndices() != 1 || EltIdx != EVI->getIndices().front())990return FoundMismatch;991992return SourceAggregate; // AggregateDescription::Found993};994995// Given elements AggElts that were constructing an aggregate OrigIVI,996// see if we can find appropriate source aggregate for each of the elements,997// and see it's the same aggregate for each element. If so, return it.998auto FindCommonSourceAggregate =999[&](std::optional<BasicBlock *> UseBB,1000std::optional<BasicBlock *> PredBB) -> std::optional<Value *> {1001std::optional<Value *> SourceAggregate;10021003for (auto I : enumerate(AggElts)) {1004assert(Describe(SourceAggregate) != AggregateDescription::FoundMismatch &&1005"We don't store nullptr in SourceAggregate!");1006assert((Describe(SourceAggregate) == AggregateDescription::Found) ==1007(I.index() != 0) &&1008"SourceAggregate should be valid after the first element,");10091010// For this element, is there a plausible source aggregate?1011// FIXME: we could special-case undef element, IFF we know that in the1012// source aggregate said element isn't poison.1013std::optional<Value *> SourceAggregateForElement =1014FindSourceAggregate(*I.value(), I.index(), UseBB, PredBB);10151016// Okay, what have we found? Does that correlate with previous findings?10171018// Regardless of whether or not we have previously found source1019// aggregate for previous elements (if any), if we didn't find one for1020// this element, passthrough whatever we have just found.1021if (Describe(SourceAggregateForElement) != AggregateDescription::Found)1022return SourceAggregateForElement;10231024// Okay, we have found source aggregate for this element.1025// Let's see what we already know from previous elements, if any.1026switch (Describe(SourceAggregate)) {1027case AggregateDescription::NotFound:1028// This is apparently the first element that we have examined.1029SourceAggregate = SourceAggregateForElement; // Record the aggregate!1030continue; // Great, now look at next element.1031case AggregateDescription::Found:1032// We have previously already successfully examined other elements.1033// Is this the same source aggregate we've found for other elements?1034if (*SourceAggregateForElement != *SourceAggregate)1035return FoundMismatch;1036continue; // Still the same aggregate, look at next element.1037case AggregateDescription::FoundMismatch:1038llvm_unreachable("Can't happen. We would have early-exited then.");1039};1040}10411042assert(Describe(SourceAggregate) == AggregateDescription::Found &&1043"Must be a valid Value");1044return *SourceAggregate;1045};10461047std::optional<Value *> SourceAggregate;10481049// Can we find the source aggregate without looking at predecessors?1050SourceAggregate = FindCommonSourceAggregate(/*UseBB=*/std::nullopt,1051/*PredBB=*/std::nullopt);1052if (Describe(SourceAggregate) != AggregateDescription::NotFound) {1053if (Describe(SourceAggregate) == AggregateDescription::FoundMismatch)1054return nullptr; // Conflicting source aggregates!1055++NumAggregateReconstructionsSimplified;1056return replaceInstUsesWith(OrigIVI, *SourceAggregate);1057}10581059// Okay, apparently we need to look at predecessors.10601061// We should be smart about picking the "use" basic block, which will be the1062// merge point for aggregate, where we'll insert the final PHI that will be1063// used instead of OrigIVI. Basic block of OrigIVI is *not* the right choice.1064// We should look in which blocks each of the AggElts is being defined,1065// they all should be defined in the same basic block.1066BasicBlock *UseBB = nullptr;10671068for (const std::optional<Instruction *> &I : AggElts) {1069BasicBlock *BB = (*I)->getParent();1070// If it's the first instruction we've encountered, record the basic block.1071if (!UseBB) {1072UseBB = BB;1073continue;1074}1075// Otherwise, this must be the same basic block we've seen previously.1076if (UseBB != BB)1077return nullptr;1078}10791080// If *all* of the elements are basic-block-independent, meaning they are1081// either function arguments, or constant expressions, then if we didn't1082// handle them without predecessor-aware handling, we won't handle them now.1083if (!UseBB)1084return nullptr;10851086// If we didn't manage to find source aggregate without looking at1087// predecessors, and there are no predecessors to look at, then we're done.1088if (pred_empty(UseBB))1089return nullptr;10901091// Arbitrary predecessor count limit.1092static const int PredCountLimit = 64;10931094// Cache the (non-uniqified!) list of predecessors in a vector,1095// checking the limit at the same time for efficiency.1096SmallVector<BasicBlock *, 4> Preds; // May have duplicates!1097for (BasicBlock *Pred : predecessors(UseBB)) {1098// Don't bother if there are too many predecessors.1099if (Preds.size() >= PredCountLimit) // FIXME: only count duplicates once?1100return nullptr;1101Preds.emplace_back(Pred);1102}11031104// For each predecessor, what is the source aggregate,1105// from which all the elements were originally extracted from?1106// Note that we want for the map to have stable iteration order!1107SmallDenseMap<BasicBlock *, Value *, 4> SourceAggregates;1108for (BasicBlock *Pred : Preds) {1109std::pair<decltype(SourceAggregates)::iterator, bool> IV =1110SourceAggregates.insert({Pred, nullptr});1111// Did we already evaluate this predecessor?1112if (!IV.second)1113continue;11141115// Let's hope that when coming from predecessor Pred, all elements of the1116// aggregate produced by OrigIVI must have been originally extracted from1117// the same aggregate. Is that so? Can we find said original aggregate?1118SourceAggregate = FindCommonSourceAggregate(UseBB, Pred);1119if (Describe(SourceAggregate) != AggregateDescription::Found)1120return nullptr; // Give up.1121IV.first->second = *SourceAggregate;1122}11231124// All good! Now we just need to thread the source aggregates here.1125// Note that we have to insert the new PHI here, ourselves, because we can't1126// rely on InstCombinerImpl::run() inserting it into the right basic block.1127// Note that the same block can be a predecessor more than once,1128// and we need to preserve that invariant for the PHI node.1129BuilderTy::InsertPointGuard Guard(Builder);1130Builder.SetInsertPoint(UseBB, UseBB->getFirstNonPHIIt());1131auto *PHI =1132Builder.CreatePHI(AggTy, Preds.size(), OrigIVI.getName() + ".merged");1133for (BasicBlock *Pred : Preds)1134PHI->addIncoming(SourceAggregates[Pred], Pred);11351136++NumAggregateReconstructionsSimplified;1137return replaceInstUsesWith(OrigIVI, PHI);1138}11391140/// Try to find redundant insertvalue instructions, like the following ones:1141/// %0 = insertvalue { i8, i32 } undef, i8 %x, 01142/// %1 = insertvalue { i8, i32 } %0, i8 %y, 01143/// Here the second instruction inserts values at the same indices, as the1144/// first one, making the first one redundant.1145/// It should be transformed to:1146/// %0 = insertvalue { i8, i32 } undef, i8 %y, 01147Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) {1148if (Value *V = simplifyInsertValueInst(1149I.getAggregateOperand(), I.getInsertedValueOperand(), I.getIndices(),1150SQ.getWithInstruction(&I)))1151return replaceInstUsesWith(I, V);11521153bool IsRedundant = false;1154ArrayRef<unsigned int> FirstIndices = I.getIndices();11551156// If there is a chain of insertvalue instructions (each of them except the1157// last one has only one use and it's another insertvalue insn from this1158// chain), check if any of the 'children' uses the same indices as the first1159// instruction. In this case, the first one is redundant.1160Value *V = &I;1161unsigned Depth = 0;1162while (V->hasOneUse() && Depth < 10) {1163User *U = V->user_back();1164auto UserInsInst = dyn_cast<InsertValueInst>(U);1165if (!UserInsInst || U->getOperand(0) != V)1166break;1167if (UserInsInst->getIndices() == FirstIndices) {1168IsRedundant = true;1169break;1170}1171V = UserInsInst;1172Depth++;1173}11741175if (IsRedundant)1176return replaceInstUsesWith(I, I.getOperand(0));11771178if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))1179return NewI;11801181return nullptr;1182}11831184static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {1185// Can not analyze scalable type, the number of elements is not a compile-time1186// constant.1187if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))1188return false;11891190int MaskSize = Shuf.getShuffleMask().size();1191int VecSize =1192cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();11931194// A vector select does not change the size of the operands.1195if (MaskSize != VecSize)1196return false;11971198// Each mask element must be undefined or choose a vector element from one of1199// the source operands without crossing vector lanes.1200for (int i = 0; i != MaskSize; ++i) {1201int Elt = Shuf.getMaskValue(i);1202if (Elt != -1 && Elt != i && Elt != i + VecSize)1203return false;1204}12051206return true;1207}12081209/// Turn a chain of inserts that splats a value into an insert + shuffle:1210/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->1211/// shufflevector(insertelt(X, %k, 0), poison, zero)1212static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {1213// We are interested in the last insert in a chain. So if this insert has a1214// single user and that user is an insert, bail.1215if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))1216return nullptr;12171218VectorType *VecTy = InsElt.getType();1219// Can not handle scalable type, the number of elements is not a compile-time1220// constant.1221if (isa<ScalableVectorType>(VecTy))1222return nullptr;1223unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();12241225// Do not try to do this for a one-element vector, since that's a nop,1226// and will cause an inf-loop.1227if (NumElements == 1)1228return nullptr;12291230Value *SplatVal = InsElt.getOperand(1);1231InsertElementInst *CurrIE = &InsElt;1232SmallBitVector ElementPresent(NumElements, false);1233InsertElementInst *FirstIE = nullptr;12341235// Walk the chain backwards, keeping track of which indices we inserted into,1236// until we hit something that isn't an insert of the splatted value.1237while (CurrIE) {1238auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));1239if (!Idx || CurrIE->getOperand(1) != SplatVal)1240return nullptr;12411242auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));1243// Check none of the intermediate steps have any additional uses, except1244// for the root insertelement instruction, which can be re-used, if it1245// inserts at position 0.1246if (CurrIE != &InsElt &&1247(!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))1248return nullptr;12491250ElementPresent[Idx->getZExtValue()] = true;1251FirstIE = CurrIE;1252CurrIE = NextIE;1253}12541255// If this is just a single insertelement (not a sequence), we are done.1256if (FirstIE == &InsElt)1257return nullptr;12581259// If we are not inserting into a poison vector, make sure we've seen an1260// insert into every element.1261// TODO: If the base vector is not undef, it might be better to create a splat1262// and then a select-shuffle (blend) with the base vector.1263if (!match(FirstIE->getOperand(0), m_Poison()))1264if (!ElementPresent.all())1265return nullptr;12661267// Create the insert + shuffle.1268Type *Int64Ty = Type::getInt64Ty(InsElt.getContext());1269PoisonValue *PoisonVec = PoisonValue::get(VecTy);1270Constant *Zero = ConstantInt::get(Int64Ty, 0);1271if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())1272FirstIE = InsertElementInst::Create(PoisonVec, SplatVal, Zero, "",1273InsElt.getIterator());12741275// Splat from element 0, but replace absent elements with poison in the mask.1276SmallVector<int, 16> Mask(NumElements, 0);1277for (unsigned i = 0; i != NumElements; ++i)1278if (!ElementPresent[i])1279Mask[i] = -1;12801281return new ShuffleVectorInst(FirstIE, Mask);1282}12831284/// Try to fold an insert element into an existing splat shuffle by changing1285/// the shuffle's mask to include the index of this insert element.1286static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {1287// Check if the vector operand of this insert is a canonical splat shuffle.1288auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));1289if (!Shuf || !Shuf->isZeroEltSplat())1290return nullptr;12911292// Bail out early if shuffle is scalable type. The number of elements in1293// shuffle mask is unknown at compile-time.1294if (isa<ScalableVectorType>(Shuf->getType()))1295return nullptr;12961297// Check for a constant insertion index.1298uint64_t IdxC;1299if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))1300return nullptr;13011302// Check if the splat shuffle's input is the same as this insert's scalar op.1303Value *X = InsElt.getOperand(1);1304Value *Op0 = Shuf->getOperand(0);1305if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))1306return nullptr;13071308// Replace the shuffle mask element at the index of this insert with a zero.1309// For example:1310// inselt (shuf (inselt undef, X, 0), _, <0,undef,0,undef>), X, 11311// --> shuf (inselt undef, X, 0), poison, <0,0,0,undef>1312unsigned NumMaskElts =1313cast<FixedVectorType>(Shuf->getType())->getNumElements();1314SmallVector<int, 16> NewMask(NumMaskElts);1315for (unsigned i = 0; i != NumMaskElts; ++i)1316NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);13171318return new ShuffleVectorInst(Op0, NewMask);1319}13201321/// Try to fold an extract+insert element into an existing identity shuffle by1322/// changing the shuffle's mask to include the index of this insert element.1323static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {1324// Check if the vector operand of this insert is an identity shuffle.1325auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));1326if (!Shuf || !match(Shuf->getOperand(1), m_Poison()) ||1327!(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))1328return nullptr;13291330// Bail out early if shuffle is scalable type. The number of elements in1331// shuffle mask is unknown at compile-time.1332if (isa<ScalableVectorType>(Shuf->getType()))1333return nullptr;13341335// Check for a constant insertion index.1336uint64_t IdxC;1337if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))1338return nullptr;13391340// Check if this insert's scalar op is extracted from the identity shuffle's1341// input vector.1342Value *Scalar = InsElt.getOperand(1);1343Value *X = Shuf->getOperand(0);1344if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))1345return nullptr;13461347// Replace the shuffle mask element at the index of this extract+insert with1348// that same index value.1349// For example:1350// inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'1351unsigned NumMaskElts =1352cast<FixedVectorType>(Shuf->getType())->getNumElements();1353SmallVector<int, 16> NewMask(NumMaskElts);1354ArrayRef<int> OldMask = Shuf->getShuffleMask();1355for (unsigned i = 0; i != NumMaskElts; ++i) {1356if (i != IdxC) {1357// All mask elements besides the inserted element remain the same.1358NewMask[i] = OldMask[i];1359} else if (OldMask[i] == (int)IdxC) {1360// If the mask element was already set, there's nothing to do1361// (demanded elements analysis may unset it later).1362return nullptr;1363} else {1364assert(OldMask[i] == PoisonMaskElem &&1365"Unexpected shuffle mask element for identity shuffle");1366NewMask[i] = IdxC;1367}1368}13691370return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);1371}13721373/// If we have an insertelement instruction feeding into another insertelement1374/// and the 2nd is inserting a constant into the vector, canonicalize that1375/// constant insertion before the insertion of a variable:1376///1377/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->1378/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC11379///1380/// This has the potential of eliminating the 2nd insertelement instruction1381/// via constant folding of the scalar constant into a vector constant.1382static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,1383InstCombiner::BuilderTy &Builder) {1384auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));1385if (!InsElt1 || !InsElt1->hasOneUse())1386return nullptr;13871388Value *X, *Y;1389Constant *ScalarC;1390ConstantInt *IdxC1, *IdxC2;1391if (match(InsElt1->getOperand(0), m_Value(X)) &&1392match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&1393match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&1394match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&1395match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {1396Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);1397return InsertElementInst::Create(NewInsElt1, Y, IdxC1);1398}13991400return nullptr;1401}14021403/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex1404/// --> shufflevector X, CVec', Mask'1405static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {1406auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));1407// Bail out if the parent has more than one use. In that case, we'd be1408// replacing the insertelt with a shuffle, and that's not a clear win.1409if (!Inst || !Inst->hasOneUse())1410return nullptr;1411if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {1412// The shuffle must have a constant vector operand. The insertelt must have1413// a constant scalar being inserted at a constant position in the vector.1414Constant *ShufConstVec, *InsEltScalar;1415uint64_t InsEltIndex;1416if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||1417!match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||1418!match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))1419return nullptr;14201421// Adding an element to an arbitrary shuffle could be expensive, but a1422// shuffle that selects elements from vectors without crossing lanes is1423// assumed cheap.1424// If we're just adding a constant into that shuffle, it will still be1425// cheap.1426if (!isShuffleEquivalentToSelect(*Shuf))1427return nullptr;14281429// From the above 'select' check, we know that the mask has the same number1430// of elements as the vector input operands. We also know that each constant1431// input element is used in its lane and can not be used more than once by1432// the shuffle. Therefore, replace the constant in the shuffle's constant1433// vector with the insertelt constant. Replace the constant in the shuffle's1434// mask vector with the insertelt index plus the length of the vector1435// (because the constant vector operand of a shuffle is always the 2nd1436// operand).1437ArrayRef<int> Mask = Shuf->getShuffleMask();1438unsigned NumElts = Mask.size();1439SmallVector<Constant *, 16> NewShufElts(NumElts);1440SmallVector<int, 16> NewMaskElts(NumElts);1441for (unsigned I = 0; I != NumElts; ++I) {1442if (I == InsEltIndex) {1443NewShufElts[I] = InsEltScalar;1444NewMaskElts[I] = InsEltIndex + NumElts;1445} else {1446// Copy over the existing values.1447NewShufElts[I] = ShufConstVec->getAggregateElement(I);1448NewMaskElts[I] = Mask[I];1449}14501451// Bail if we failed to find an element.1452if (!NewShufElts[I])1453return nullptr;1454}14551456// Create new operands for a shuffle that includes the constant of the1457// original insertelt. The old shuffle will be dead now.1458return new ShuffleVectorInst(Shuf->getOperand(0),1459ConstantVector::get(NewShufElts), NewMaskElts);1460} else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {1461// Transform sequences of insertelements ops with constant data/indexes into1462// a single shuffle op.1463// Can not handle scalable type, the number of elements needed to create1464// shuffle mask is not a compile-time constant.1465if (isa<ScalableVectorType>(InsElt.getType()))1466return nullptr;1467unsigned NumElts =1468cast<FixedVectorType>(InsElt.getType())->getNumElements();14691470uint64_t InsertIdx[2];1471Constant *Val[2];1472if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||1473!match(InsElt.getOperand(1), m_Constant(Val[0])) ||1474!match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||1475!match(IEI->getOperand(1), m_Constant(Val[1])))1476return nullptr;1477SmallVector<Constant *, 16> Values(NumElts);1478SmallVector<int, 16> Mask(NumElts);1479auto ValI = std::begin(Val);1480// Generate new constant vector and mask.1481// We have 2 values/masks from the insertelements instructions. Insert them1482// into new value/mask vectors.1483for (uint64_t I : InsertIdx) {1484if (!Values[I]) {1485Values[I] = *ValI;1486Mask[I] = NumElts + I;1487}1488++ValI;1489}1490// Remaining values are filled with 'poison' values.1491for (unsigned I = 0; I < NumElts; ++I) {1492if (!Values[I]) {1493Values[I] = PoisonValue::get(InsElt.getType()->getElementType());1494Mask[I] = I;1495}1496}1497// Create new operands for a shuffle that includes the constant of the1498// original insertelt.1499return new ShuffleVectorInst(IEI->getOperand(0),1500ConstantVector::get(Values), Mask);1501}1502return nullptr;1503}15041505/// If both the base vector and the inserted element are extended from the same1506/// type, do the insert element in the narrow source type followed by extend.1507/// TODO: This can be extended to include other cast opcodes, but particularly1508/// if we create a wider insertelement, make sure codegen is not harmed.1509static Instruction *narrowInsElt(InsertElementInst &InsElt,1510InstCombiner::BuilderTy &Builder) {1511// We are creating a vector extend. If the original vector extend has another1512// use, that would mean we end up with 2 vector extends, so avoid that.1513// TODO: We could ease the use-clause to "if at least one op has one use"1514// (assuming that the source types match - see next TODO comment).1515Value *Vec = InsElt.getOperand(0);1516if (!Vec->hasOneUse())1517return nullptr;15181519Value *Scalar = InsElt.getOperand(1);1520Value *X, *Y;1521CastInst::CastOps CastOpcode;1522if (match(Vec, m_FPExt(m_Value(X))) && match(Scalar, m_FPExt(m_Value(Y))))1523CastOpcode = Instruction::FPExt;1524else if (match(Vec, m_SExt(m_Value(X))) && match(Scalar, m_SExt(m_Value(Y))))1525CastOpcode = Instruction::SExt;1526else if (match(Vec, m_ZExt(m_Value(X))) && match(Scalar, m_ZExt(m_Value(Y))))1527CastOpcode = Instruction::ZExt;1528else1529return nullptr;15301531// TODO: We can allow mismatched types by creating an intermediate cast.1532if (X->getType()->getScalarType() != Y->getType())1533return nullptr;15341535// inselt (ext X), (ext Y), Index --> ext (inselt X, Y, Index)1536Value *NewInsElt = Builder.CreateInsertElement(X, Y, InsElt.getOperand(2));1537return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());1538}15391540/// If we are inserting 2 halves of a value into adjacent elements of a vector,1541/// try to convert to a single insert with appropriate bitcasts.1542static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,1543bool IsBigEndian,1544InstCombiner::BuilderTy &Builder) {1545Value *VecOp = InsElt.getOperand(0);1546Value *ScalarOp = InsElt.getOperand(1);1547Value *IndexOp = InsElt.getOperand(2);15481549// Pattern depends on endian because we expect lower index is inserted first.1550// Big endian:1551// inselt (inselt BaseVec, (trunc (lshr X, BW/2), Index0), (trunc X), Index11552// Little endian:1553// inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index11554// Note: It is not safe to do this transform with an arbitrary base vector1555// because the bitcast of that vector to fewer/larger elements could1556// allow poison to spill into an element that was not poison before.1557// TODO: Detect smaller fractions of the scalar.1558// TODO: One-use checks are conservative.1559auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());1560Value *Scalar0, *BaseVec;1561uint64_t Index0, Index1;1562if (!VTy || (VTy->getNumElements() & 1) ||1563!match(IndexOp, m_ConstantInt(Index1)) ||1564!match(VecOp, m_InsertElt(m_Value(BaseVec), m_Value(Scalar0),1565m_ConstantInt(Index0))) ||1566!match(BaseVec, m_Undef()))1567return nullptr;15681569// The first insert must be to the index one less than this one, and1570// the first insert must be to an even index.1571if (Index0 + 1 != Index1 || Index0 & 1)1572return nullptr;15731574// For big endian, the high half of the value should be inserted first.1575// For little endian, the low half of the value should be inserted first.1576Value *X;1577uint64_t ShAmt;1578if (IsBigEndian) {1579if (!match(ScalarOp, m_Trunc(m_Value(X))) ||1580!match(Scalar0, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt)))))1581return nullptr;1582} else {1583if (!match(Scalar0, m_Trunc(m_Value(X))) ||1584!match(ScalarOp, m_Trunc(m_LShr(m_Specific(X), m_ConstantInt(ShAmt)))))1585return nullptr;1586}15871588Type *SrcTy = X->getType();1589unsigned ScalarWidth = SrcTy->getScalarSizeInBits();1590unsigned VecEltWidth = VTy->getScalarSizeInBits();1591if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth)1592return nullptr;15931594// Bitcast the base vector to a vector type with the source element type.1595Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2);1596Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy);15971598// Scale the insert index for a vector with half as many elements.1599// bitcast (inselt (bitcast BaseVec), X, NewIndex)1600uint64_t NewIndex = IsBigEndian ? Index1 / 2 : Index0 / 2;1601Value *NewInsert = Builder.CreateInsertElement(CastBaseVec, X, NewIndex);1602return new BitCastInst(NewInsert, VTy);1603}16041605Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {1606Value *VecOp = IE.getOperand(0);1607Value *ScalarOp = IE.getOperand(1);1608Value *IdxOp = IE.getOperand(2);16091610if (auto *V = simplifyInsertElementInst(1611VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))1612return replaceInstUsesWith(IE, V);16131614// Canonicalize type of constant indices to i64 to simplify CSE1615if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) {1616if (auto *NewIdx = getPreferredVectorIndex(IndexC))1617return replaceOperand(IE, 2, NewIdx);16181619Value *BaseVec, *OtherScalar;1620uint64_t OtherIndexVal;1621if (match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec),1622m_Value(OtherScalar),1623m_ConstantInt(OtherIndexVal)))) &&1624!isa<Constant>(OtherScalar) && OtherIndexVal > IndexC->getZExtValue()) {1625Value *NewIns = Builder.CreateInsertElement(BaseVec, ScalarOp, IdxOp);1626return InsertElementInst::Create(NewIns, OtherScalar,1627Builder.getInt64(OtherIndexVal));1628}1629}16301631// If the scalar is bitcast and inserted into undef, do the insert in the1632// source type followed by bitcast.1633// TODO: Generalize for insert into any constant, not just undef?1634Value *ScalarSrc;1635if (match(VecOp, m_Undef()) &&1636match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&1637(ScalarSrc->getType()->isIntegerTy() ||1638ScalarSrc->getType()->isFloatingPointTy())) {1639// inselt undef, (bitcast ScalarSrc), IdxOp -->1640// bitcast (inselt undef, ScalarSrc, IdxOp)1641Type *ScalarTy = ScalarSrc->getType();1642Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());1643Constant *NewUndef = isa<PoisonValue>(VecOp) ? PoisonValue::get(VecTy)1644: UndefValue::get(VecTy);1645Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);1646return new BitCastInst(NewInsElt, IE.getType());1647}16481649// If the vector and scalar are both bitcast from the same element type, do1650// the insert in that source type followed by bitcast.1651Value *VecSrc;1652if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&1653match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&1654(VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&1655VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&1656cast<VectorType>(VecSrc->getType())->getElementType() ==1657ScalarSrc->getType()) {1658// inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->1659// bitcast (inselt VecSrc, ScalarSrc, IdxOp)1660Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);1661return new BitCastInst(NewInsElt, IE.getType());1662}16631664// If the inserted element was extracted from some other fixed-length vector1665// and both indexes are valid constants, try to turn this into a shuffle.1666// Can not handle scalable vector type, the number of elements needed to1667// create shuffle mask is not a compile-time constant.1668uint64_t InsertedIdx, ExtractedIdx;1669Value *ExtVecOp;1670if (isa<FixedVectorType>(IE.getType()) &&1671match(IdxOp, m_ConstantInt(InsertedIdx)) &&1672match(ScalarOp,1673m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&1674isa<FixedVectorType>(ExtVecOp->getType()) &&1675ExtractedIdx <1676cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {1677// TODO: Looking at the user(s) to determine if this insert is a1678// fold-to-shuffle opportunity does not match the usual instcombine1679// constraints. We should decide if the transform is worthy based only1680// on this instruction and its operands, but that may not work currently.1681//1682// Here, we are trying to avoid creating shuffles before reaching1683// the end of a chain of extract-insert pairs. This is complicated because1684// we do not generally form arbitrary shuffle masks in instcombine1685// (because those may codegen poorly), but collectShuffleElements() does1686// exactly that.1687//1688// The rules for determining what is an acceptable target-independent1689// shuffle mask are fuzzy because they evolve based on the backend's1690// capabilities and real-world impact.1691auto isShuffleRootCandidate = [](InsertElementInst &Insert) {1692if (!Insert.hasOneUse())1693return true;1694auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());1695if (!InsertUser)1696return true;1697return false;1698};16991700// Try to form a shuffle from a chain of extract-insert ops.1701if (isShuffleRootCandidate(IE)) {1702bool Rerun = true;1703while (Rerun) {1704Rerun = false;17051706SmallVector<int, 16> Mask;1707ShuffleOps LR =1708collectShuffleElements(&IE, Mask, nullptr, *this, Rerun);17091710// The proposed shuffle may be trivial, in which case we shouldn't1711// perform the combine.1712if (LR.first != &IE && LR.second != &IE) {1713// We now have a shuffle of LHS, RHS, Mask.1714if (LR.second == nullptr)1715LR.second = PoisonValue::get(LR.first->getType());1716return new ShuffleVectorInst(LR.first, LR.second, Mask);1717}1718}1719}1720}17211722if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {1723unsigned VWidth = VecTy->getNumElements();1724APInt PoisonElts(VWidth, 0);1725APInt AllOnesEltMask(APInt::getAllOnes(VWidth));1726if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask,1727PoisonElts)) {1728if (V != &IE)1729return replaceInstUsesWith(IE, V);1730return &IE;1731}1732}17331734if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))1735return Shuf;17361737if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))1738return NewInsElt;17391740if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))1741return Broadcast;17421743if (Instruction *Splat = foldInsEltIntoSplat(IE))1744return Splat;17451746if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))1747return IdentityShuf;17481749if (Instruction *Ext = narrowInsElt(IE, Builder))1750return Ext;17511752if (Instruction *Ext = foldTruncInsEltPair(IE, DL.isBigEndian(), Builder))1753return Ext;17541755return nullptr;1756}17571758/// Return true if we can evaluate the specified expression tree if the vector1759/// elements were shuffled in a different order.1760static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,1761unsigned Depth = 5) {1762// We can always reorder the elements of a constant.1763if (isa<Constant>(V))1764return true;17651766// We won't reorder vector arguments. No IPO here.1767Instruction *I = dyn_cast<Instruction>(V);1768if (!I) return false;17691770// Two users may expect different orders of the elements. Don't try it.1771if (!I->hasOneUse())1772return false;17731774if (Depth == 0) return false;17751776switch (I->getOpcode()) {1777case Instruction::UDiv:1778case Instruction::SDiv:1779case Instruction::URem:1780case Instruction::SRem:1781// Propagating an undefined shuffle mask element to integer div/rem is not1782// allowed because those opcodes can create immediate undefined behavior1783// from an undefined element in an operand.1784if (llvm::is_contained(Mask, -1))1785return false;1786[[fallthrough]];1787case Instruction::Add:1788case Instruction::FAdd:1789case Instruction::Sub:1790case Instruction::FSub:1791case Instruction::Mul:1792case Instruction::FMul:1793case Instruction::FDiv:1794case Instruction::FRem:1795case Instruction::Shl:1796case Instruction::LShr:1797case Instruction::AShr:1798case Instruction::And:1799case Instruction::Or:1800case Instruction::Xor:1801case Instruction::ICmp:1802case Instruction::FCmp:1803case Instruction::Trunc:1804case Instruction::ZExt:1805case Instruction::SExt:1806case Instruction::FPToUI:1807case Instruction::FPToSI:1808case Instruction::UIToFP:1809case Instruction::SIToFP:1810case Instruction::FPTrunc:1811case Instruction::FPExt:1812case Instruction::GetElementPtr: {1813// Bail out if we would create longer vector ops. We could allow creating1814// longer vector ops, but that may result in more expensive codegen.1815Type *ITy = I->getType();1816if (ITy->isVectorTy() &&1817Mask.size() > cast<FixedVectorType>(ITy)->getNumElements())1818return false;1819for (Value *Operand : I->operands()) {1820if (!canEvaluateShuffled(Operand, Mask, Depth - 1))1821return false;1822}1823return true;1824}1825case Instruction::InsertElement: {1826ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));1827if (!CI) return false;1828int ElementNumber = CI->getLimitedValue();18291830// Verify that 'CI' does not occur twice in Mask. A single 'insertelement'1831// can't put an element into multiple indices.1832bool SeenOnce = false;1833for (int I : Mask) {1834if (I == ElementNumber) {1835if (SeenOnce)1836return false;1837SeenOnce = true;1838}1839}1840return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);1841}1842}1843return false;1844}18451846/// Rebuild a new instruction just like 'I' but with the new operands given.1847/// In the event of type mismatch, the type of the operands is correct.1848static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps,1849IRBuilderBase &Builder) {1850Builder.SetInsertPoint(I);1851switch (I->getOpcode()) {1852case Instruction::Add:1853case Instruction::FAdd:1854case Instruction::Sub:1855case Instruction::FSub:1856case Instruction::Mul:1857case Instruction::FMul:1858case Instruction::UDiv:1859case Instruction::SDiv:1860case Instruction::FDiv:1861case Instruction::URem:1862case Instruction::SRem:1863case Instruction::FRem:1864case Instruction::Shl:1865case Instruction::LShr:1866case Instruction::AShr:1867case Instruction::And:1868case Instruction::Or:1869case Instruction::Xor: {1870BinaryOperator *BO = cast<BinaryOperator>(I);1871assert(NewOps.size() == 2 && "binary operator with #ops != 2");1872Value *New = Builder.CreateBinOp(cast<BinaryOperator>(I)->getOpcode(),1873NewOps[0], NewOps[1]);1874if (auto *NewI = dyn_cast<Instruction>(New)) {1875if (isa<OverflowingBinaryOperator>(BO)) {1876NewI->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());1877NewI->setHasNoSignedWrap(BO->hasNoSignedWrap());1878}1879if (isa<PossiblyExactOperator>(BO)) {1880NewI->setIsExact(BO->isExact());1881}1882if (isa<FPMathOperator>(BO))1883NewI->copyFastMathFlags(I);1884}1885return New;1886}1887case Instruction::ICmp:1888assert(NewOps.size() == 2 && "icmp with #ops != 2");1889return Builder.CreateICmp(cast<ICmpInst>(I)->getPredicate(), NewOps[0],1890NewOps[1]);1891case Instruction::FCmp:1892assert(NewOps.size() == 2 && "fcmp with #ops != 2");1893return Builder.CreateFCmp(cast<FCmpInst>(I)->getPredicate(), NewOps[0],1894NewOps[1]);1895case Instruction::Trunc:1896case Instruction::ZExt:1897case Instruction::SExt:1898case Instruction::FPToUI:1899case Instruction::FPToSI:1900case Instruction::UIToFP:1901case Instruction::SIToFP:1902case Instruction::FPTrunc:1903case Instruction::FPExt: {1904// It's possible that the mask has a different number of elements from1905// the original cast. We recompute the destination type to match the mask.1906Type *DestTy = VectorType::get(1907I->getType()->getScalarType(),1908cast<VectorType>(NewOps[0]->getType())->getElementCount());1909assert(NewOps.size() == 1 && "cast with #ops != 1");1910return Builder.CreateCast(cast<CastInst>(I)->getOpcode(), NewOps[0],1911DestTy);1912}1913case Instruction::GetElementPtr: {1914Value *Ptr = NewOps[0];1915ArrayRef<Value*> Idx = NewOps.slice(1);1916return Builder.CreateGEP(cast<GEPOperator>(I)->getSourceElementType(),1917Ptr, Idx, "",1918cast<GEPOperator>(I)->isInBounds());1919}1920}1921llvm_unreachable("failed to rebuild vector instructions");1922}19231924static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask,1925IRBuilderBase &Builder) {1926// Mask.size() does not need to be equal to the number of vector elements.19271928assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");1929Type *EltTy = V->getType()->getScalarType();19301931if (isa<PoisonValue>(V))1932return PoisonValue::get(FixedVectorType::get(EltTy, Mask.size()));19331934if (match(V, m_Undef()))1935return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));19361937if (isa<ConstantAggregateZero>(V))1938return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));19391940if (Constant *C = dyn_cast<Constant>(V))1941return ConstantExpr::getShuffleVector(C, PoisonValue::get(C->getType()),1942Mask);19431944Instruction *I = cast<Instruction>(V);1945switch (I->getOpcode()) {1946case Instruction::Add:1947case Instruction::FAdd:1948case Instruction::Sub:1949case Instruction::FSub:1950case Instruction::Mul:1951case Instruction::FMul:1952case Instruction::UDiv:1953case Instruction::SDiv:1954case Instruction::FDiv:1955case Instruction::URem:1956case Instruction::SRem:1957case Instruction::FRem:1958case Instruction::Shl:1959case Instruction::LShr:1960case Instruction::AShr:1961case Instruction::And:1962case Instruction::Or:1963case Instruction::Xor:1964case Instruction::ICmp:1965case Instruction::FCmp:1966case Instruction::Trunc:1967case Instruction::ZExt:1968case Instruction::SExt:1969case Instruction::FPToUI:1970case Instruction::FPToSI:1971case Instruction::UIToFP:1972case Instruction::SIToFP:1973case Instruction::FPTrunc:1974case Instruction::FPExt:1975case Instruction::Select:1976case Instruction::GetElementPtr: {1977SmallVector<Value*, 8> NewOps;1978bool NeedsRebuild =1979(Mask.size() !=1980cast<FixedVectorType>(I->getType())->getNumElements());1981for (int i = 0, e = I->getNumOperands(); i != e; ++i) {1982Value *V;1983// Recursively call evaluateInDifferentElementOrder on vector arguments1984// as well. E.g. GetElementPtr may have scalar operands even if the1985// return value is a vector, so we need to examine the operand type.1986if (I->getOperand(i)->getType()->isVectorTy())1987V = evaluateInDifferentElementOrder(I->getOperand(i), Mask, Builder);1988else1989V = I->getOperand(i);1990NewOps.push_back(V);1991NeedsRebuild |= (V != I->getOperand(i));1992}1993if (NeedsRebuild)1994return buildNew(I, NewOps, Builder);1995return I;1996}1997case Instruction::InsertElement: {1998int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();19992000// The insertelement was inserting at Element. Figure out which element2001// that becomes after shuffling. The answer is guaranteed to be unique2002// by CanEvaluateShuffled.2003bool Found = false;2004int Index = 0;2005for (int e = Mask.size(); Index != e; ++Index) {2006if (Mask[Index] == Element) {2007Found = true;2008break;2009}2010}20112012// If element is not in Mask, no need to handle the operand 1 (element to2013// be inserted). Just evaluate values in operand 0 according to Mask.2014if (!Found)2015return evaluateInDifferentElementOrder(I->getOperand(0), Mask, Builder);20162017Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask,2018Builder);2019Builder.SetInsertPoint(I);2020return Builder.CreateInsertElement(V, I->getOperand(1), Index);2021}2022}2023llvm_unreachable("failed to reorder elements of vector instruction!");2024}20252026// Returns true if the shuffle is extracting a contiguous range of values from2027// LHS, for example:2028// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+2029// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|2030// Shuffles to: |EE|FF|GG|HH|2031// +--+--+--+--+2032static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,2033ArrayRef<int> Mask) {2034unsigned LHSElems =2035cast<FixedVectorType>(SVI.getOperand(0)->getType())->getNumElements();2036unsigned MaskElems = Mask.size();2037unsigned BegIdx = Mask.front();2038unsigned EndIdx = Mask.back();2039if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)2040return false;2041for (unsigned I = 0; I != MaskElems; ++I)2042if (static_cast<unsigned>(Mask[I]) != BegIdx + I)2043return false;2044return true;2045}20462047/// These are the ingredients in an alternate form binary operator as described2048/// below.2049struct BinopElts {2050BinaryOperator::BinaryOps Opcode;2051Value *Op0;2052Value *Op1;2053BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,2054Value *V0 = nullptr, Value *V1 = nullptr) :2055Opcode(Opc), Op0(V0), Op1(V1) {}2056operator bool() const { return Opcode != 0; }2057};20582059/// Binops may be transformed into binops with different opcodes and operands.2060/// Reverse the usual canonicalization to enable folds with the non-canonical2061/// form of the binop. If a transform is possible, return the elements of the2062/// new binop. If not, return invalid elements.2063static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {2064Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);2065Type *Ty = BO->getType();2066switch (BO->getOpcode()) {2067case Instruction::Shl: {2068// shl X, C --> mul X, (1 << C)2069Constant *C;2070if (match(BO1, m_ImmConstant(C))) {2071Constant *ShlOne = ConstantFoldBinaryOpOperands(2072Instruction::Shl, ConstantInt::get(Ty, 1), C, DL);2073assert(ShlOne && "Constant folding of immediate constants failed");2074return {Instruction::Mul, BO0, ShlOne};2075}2076break;2077}2078case Instruction::Or: {2079// or disjoin X, C --> add X, C2080if (cast<PossiblyDisjointInst>(BO)->isDisjoint())2081return {Instruction::Add, BO0, BO1};2082break;2083}2084case Instruction::Sub:2085// sub 0, X --> mul X, -12086if (match(BO0, m_ZeroInt()))2087return {Instruction::Mul, BO1, ConstantInt::getAllOnesValue(Ty)};2088break;2089default:2090break;2091}2092return {};2093}20942095/// A select shuffle of a select shuffle with a shared operand can be reduced2096/// to a single select shuffle. This is an obvious improvement in IR, and the2097/// backend is expected to lower select shuffles efficiently.2098static Instruction *foldSelectShuffleOfSelectShuffle(ShuffleVectorInst &Shuf) {2099assert(Shuf.isSelect() && "Must have select-equivalent shuffle");21002101Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);2102SmallVector<int, 16> Mask;2103Shuf.getShuffleMask(Mask);2104unsigned NumElts = Mask.size();21052106// Canonicalize a select shuffle with common operand as Op1.2107auto *ShufOp = dyn_cast<ShuffleVectorInst>(Op0);2108if (ShufOp && ShufOp->isSelect() &&2109(ShufOp->getOperand(0) == Op1 || ShufOp->getOperand(1) == Op1)) {2110std::swap(Op0, Op1);2111ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);2112}21132114ShufOp = dyn_cast<ShuffleVectorInst>(Op1);2115if (!ShufOp || !ShufOp->isSelect() ||2116(ShufOp->getOperand(0) != Op0 && ShufOp->getOperand(1) != Op0))2117return nullptr;21182119Value *X = ShufOp->getOperand(0), *Y = ShufOp->getOperand(1);2120SmallVector<int, 16> Mask1;2121ShufOp->getShuffleMask(Mask1);2122assert(Mask1.size() == NumElts && "Vector size changed with select shuffle");21232124// Canonicalize common operand (Op0) as X (first operand of first shuffle).2125if (Y == Op0) {2126std::swap(X, Y);2127ShuffleVectorInst::commuteShuffleMask(Mask1, NumElts);2128}21292130// If the mask chooses from X (operand 0), it stays the same.2131// If the mask chooses from the earlier shuffle, the other mask value is2132// transferred to the combined select shuffle:2133// shuf X, (shuf X, Y, M1), M --> shuf X, Y, M'2134SmallVector<int, 16> NewMask(NumElts);2135for (unsigned i = 0; i != NumElts; ++i)2136NewMask[i] = Mask[i] < (signed)NumElts ? Mask[i] : Mask1[i];21372138// A select mask with undef elements might look like an identity mask.2139assert((ShuffleVectorInst::isSelectMask(NewMask, NumElts) ||2140ShuffleVectorInst::isIdentityMask(NewMask, NumElts)) &&2141"Unexpected shuffle mask");2142return new ShuffleVectorInst(X, Y, NewMask);2143}21442145static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf,2146const SimplifyQuery &SQ) {2147assert(Shuf.isSelect() && "Must have select-equivalent shuffle");21482149// Are we shuffling together some value and that same value after it has been2150// modified by a binop with a constant?2151Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);2152Constant *C;2153bool Op0IsBinop;2154if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))2155Op0IsBinop = true;2156else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))2157Op0IsBinop = false;2158else2159return nullptr;21602161// The identity constant for a binop leaves a variable operand unchanged. For2162// a vector, this is a splat of something like 0, -1, or 1.2163// If there's no identity constant for this binop, we're done.2164auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);2165BinaryOperator::BinaryOps BOpcode = BO->getOpcode();2166Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);2167if (!IdC)2168return nullptr;21692170Value *X = Op0IsBinop ? Op1 : Op0;21712172// Prevent folding in the case the non-binop operand might have NaN values.2173// If X can have NaN elements then we have that the floating point math2174// operation in the transformed code may not preserve the exact NaN2175// bit-pattern -- e.g. `fadd sNaN, 0.0 -> qNaN`.2176// This makes the transformation incorrect since the original program would2177// have preserved the exact NaN bit-pattern.2178// Avoid the folding if X can have NaN elements.2179if (Shuf.getType()->getElementType()->isFloatingPointTy() &&2180!isKnownNeverNaN(X, 0, SQ))2181return nullptr;21822183// Shuffle identity constants into the lanes that return the original value.2184// Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}2185// Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}2186// The existing binop constant vector remains in the same operand position.2187ArrayRef<int> Mask = Shuf.getShuffleMask();2188Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :2189ConstantExpr::getShuffleVector(IdC, C, Mask);21902191bool MightCreatePoisonOrUB =2192is_contained(Mask, PoisonMaskElem) &&2193(Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));2194if (MightCreatePoisonOrUB)2195NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true);21962197// shuf (bop X, C), X, M --> bop X, C'2198// shuf X, (bop X, C), M --> bop X, C'2199Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);2200NewBO->copyIRFlags(BO);22012202// An undef shuffle mask element may propagate as an undef constant element in2203// the new binop. That would produce poison where the original code might not.2204// If we already made a safe constant, then there's no danger.2205if (is_contained(Mask, PoisonMaskElem) && !MightCreatePoisonOrUB)2206NewBO->dropPoisonGeneratingFlags();2207return NewBO;2208}22092210/// If we have an insert of a scalar to a non-zero element of an undefined2211/// vector and then shuffle that value, that's the same as inserting to the zero2212/// element and shuffling. Splatting from the zero element is recognized as the2213/// canonical form of splat.2214static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,2215InstCombiner::BuilderTy &Builder) {2216Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);2217ArrayRef<int> Mask = Shuf.getShuffleMask();2218Value *X;2219uint64_t IndexC;22202221// Match a shuffle that is a splat to a non-zero element.2222if (!match(Op0, m_OneUse(m_InsertElt(m_Poison(), m_Value(X),2223m_ConstantInt(IndexC)))) ||2224!match(Op1, m_Poison()) || match(Mask, m_ZeroMask()) || IndexC == 0)2225return nullptr;22262227// Insert into element 0 of a poison vector.2228PoisonValue *PoisonVec = PoisonValue::get(Shuf.getType());2229Value *NewIns = Builder.CreateInsertElement(PoisonVec, X, (uint64_t)0);22302231// Splat from element 0. Any mask element that is poison remains poison.2232// For example:2233// shuf (inselt poison, X, 2), _, <2,2,undef>2234// --> shuf (inselt poison, X, 0), poison, <0,0,undef>2235unsigned NumMaskElts =2236cast<FixedVectorType>(Shuf.getType())->getNumElements();2237SmallVector<int, 16> NewMask(NumMaskElts, 0);2238for (unsigned i = 0; i != NumMaskElts; ++i)2239if (Mask[i] == PoisonMaskElem)2240NewMask[i] = Mask[i];22412242return new ShuffleVectorInst(NewIns, NewMask);2243}22442245/// Try to fold shuffles that are the equivalent of a vector select.2246Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) {2247if (!Shuf.isSelect())2248return nullptr;22492250// Canonicalize to choose from operand 0 first unless operand 1 is undefined.2251// Commuting undef to operand 0 conflicts with another canonicalization.2252unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();2253if (!match(Shuf.getOperand(1), m_Undef()) &&2254Shuf.getMaskValue(0) >= (int)NumElts) {2255// TODO: Can we assert that both operands of a shuffle-select are not undef2256// (otherwise, it would have been folded by instsimplify?2257Shuf.commute();2258return &Shuf;2259}22602261if (Instruction *I = foldSelectShuffleOfSelectShuffle(Shuf))2262return I;22632264if (Instruction *I = foldSelectShuffleWith1Binop(2265Shuf, getSimplifyQuery().getWithInstruction(&Shuf)))2266return I;22672268BinaryOperator *B0, *B1;2269if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||2270!match(Shuf.getOperand(1), m_BinOp(B1)))2271return nullptr;22722273// If one operand is "0 - X", allow that to be viewed as "X * -1"2274// (ConstantsAreOp1) by getAlternateBinop below. If the neg is not paired2275// with a multiply, we will exit because C0/C1 will not be set.2276Value *X, *Y;2277Constant *C0 = nullptr, *C1 = nullptr;2278bool ConstantsAreOp1;2279if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&2280match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))2281ConstantsAreOp1 = false;2282else if (match(B0, m_CombineOr(m_BinOp(m_Value(X), m_Constant(C0)),2283m_Neg(m_Value(X)))) &&2284match(B1, m_CombineOr(m_BinOp(m_Value(Y), m_Constant(C1)),2285m_Neg(m_Value(Y)))))2286ConstantsAreOp1 = true;2287else2288return nullptr;22892290// We need matching binops to fold the lanes together.2291BinaryOperator::BinaryOps Opc0 = B0->getOpcode();2292BinaryOperator::BinaryOps Opc1 = B1->getOpcode();2293bool DropNSW = false;2294if (ConstantsAreOp1 && Opc0 != Opc1) {2295// TODO: We drop "nsw" if shift is converted into multiply because it may2296// not be correct when the shift amount is BitWidth - 1. We could examine2297// each vector element to determine if it is safe to keep that flag.2298if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)2299DropNSW = true;2300if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {2301assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");2302Opc0 = AltB0.Opcode;2303C0 = cast<Constant>(AltB0.Op1);2304} else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {2305assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");2306Opc1 = AltB1.Opcode;2307C1 = cast<Constant>(AltB1.Op1);2308}2309}23102311if (Opc0 != Opc1 || !C0 || !C1)2312return nullptr;23132314// The opcodes must be the same. Use a new name to make that clear.2315BinaryOperator::BinaryOps BOpc = Opc0;23162317// Select the constant elements needed for the single binop.2318ArrayRef<int> Mask = Shuf.getShuffleMask();2319Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);23202321// We are moving a binop after a shuffle. When a shuffle has an undefined2322// mask element, the result is undefined, but it is not poison or undefined2323// behavior. That is not necessarily true for div/rem/shift.2324bool MightCreatePoisonOrUB =2325is_contained(Mask, PoisonMaskElem) &&2326(Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));2327if (MightCreatePoisonOrUB)2328NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC,2329ConstantsAreOp1);23302331Value *V;2332if (X == Y) {2333// Remove a binop and the shuffle by rearranging the constant:2334// shuffle (op V, C0), (op V, C1), M --> op V, C'2335// shuffle (op C0, V), (op C1, V), M --> op C', V2336V = X;2337} else {2338// If there are 2 different variable operands, we must create a new shuffle2339// (select) first, so check uses to ensure that we don't end up with more2340// instructions than we started with.2341if (!B0->hasOneUse() && !B1->hasOneUse())2342return nullptr;23432344// If we use the original shuffle mask and op1 is *variable*, we would be2345// putting an undef into operand 1 of div/rem/shift. This is either UB or2346// poison. We do not have to guard against UB when *constants* are op12347// because safe constants guarantee that we do not overflow sdiv/srem (and2348// there's no danger for other opcodes).2349// TODO: To allow this case, create a new shuffle mask with no undefs.2350if (MightCreatePoisonOrUB && !ConstantsAreOp1)2351return nullptr;23522353// Note: In general, we do not create new shuffles in InstCombine because we2354// do not know if a target can lower an arbitrary shuffle optimally. In this2355// case, the shuffle uses the existing mask, so there is no additional risk.23562357// Select the variable vectors first, then perform the binop:2358// shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'2359// shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)2360V = Builder.CreateShuffleVector(X, Y, Mask);2361}23622363Value *NewBO = ConstantsAreOp1 ? Builder.CreateBinOp(BOpc, V, NewC) :2364Builder.CreateBinOp(BOpc, NewC, V);23652366// Flags are intersected from the 2 source binops. But there are 2 exceptions:2367// 1. If we changed an opcode, poison conditions might have changed.2368// 2. If the shuffle had undef mask elements, the new binop might have undefs2369// where the original code did not. But if we already made a safe constant,2370// then there's no danger.2371if (auto *NewI = dyn_cast<Instruction>(NewBO)) {2372NewI->copyIRFlags(B0);2373NewI->andIRFlags(B1);2374if (DropNSW)2375NewI->setHasNoSignedWrap(false);2376if (is_contained(Mask, PoisonMaskElem) && !MightCreatePoisonOrUB)2377NewI->dropPoisonGeneratingFlags();2378}2379return replaceInstUsesWith(Shuf, NewBO);2380}23812382/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.2383/// Example (little endian):2384/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>2385static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,2386bool IsBigEndian) {2387// This must be a bitcasted shuffle of 1 vector integer operand.2388Type *DestType = Shuf.getType();2389Value *X;2390if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||2391!match(Shuf.getOperand(1), m_Poison()) || !DestType->isIntOrIntVectorTy())2392return nullptr;23932394// The source type must have the same number of elements as the shuffle,2395// and the source element type must be larger than the shuffle element type.2396Type *SrcType = X->getType();2397if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||2398cast<FixedVectorType>(SrcType)->getNumElements() !=2399cast<FixedVectorType>(DestType)->getNumElements() ||2400SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)2401return nullptr;24022403assert(Shuf.changesLength() && !Shuf.increasesLength() &&2404"Expected a shuffle that decreases length");24052406// Last, check that the mask chooses the correct low bits for each narrow2407// element in the result.2408uint64_t TruncRatio =2409SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();2410ArrayRef<int> Mask = Shuf.getShuffleMask();2411for (unsigned i = 0, e = Mask.size(); i != e; ++i) {2412if (Mask[i] == PoisonMaskElem)2413continue;2414uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;2415assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits");2416if (Mask[i] != (int)LSBIndex)2417return nullptr;2418}24192420return new TruncInst(X, DestType);2421}24222423/// Match a shuffle-select-shuffle pattern where the shuffles are widening and2424/// narrowing (concatenating with poison and extracting back to the original2425/// length). This allows replacing the wide select with a narrow select.2426static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,2427InstCombiner::BuilderTy &Builder) {2428// This must be a narrowing identity shuffle. It extracts the 1st N elements2429// of the 1st vector operand of a shuffle.2430if (!match(Shuf.getOperand(1), m_Poison()) || !Shuf.isIdentityWithExtract())2431return nullptr;24322433// The vector being shuffled must be a vector select that we can eliminate.2434// TODO: The one-use requirement could be eased if X and/or Y are constants.2435Value *Cond, *X, *Y;2436if (!match(Shuf.getOperand(0),2437m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))2438return nullptr;24392440// We need a narrow condition value. It must be extended with poison elements2441// and have the same number of elements as this shuffle.2442unsigned NarrowNumElts =2443cast<FixedVectorType>(Shuf.getType())->getNumElements();2444Value *NarrowCond;2445if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Poison()))) ||2446cast<FixedVectorType>(NarrowCond->getType())->getNumElements() !=2447NarrowNumElts ||2448!cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())2449return nullptr;24502451// shuf (sel (shuf NarrowCond, poison, WideMask), X, Y), poison, NarrowMask)2452// -->2453// sel NarrowCond, (shuf X, poison, NarrowMask), (shuf Y, poison, NarrowMask)2454Value *NarrowX = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());2455Value *NarrowY = Builder.CreateShuffleVector(Y, Shuf.getShuffleMask());2456return SelectInst::Create(NarrowCond, NarrowX, NarrowY);2457}24582459/// Canonicalize FP negate/abs after shuffle.2460static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf,2461InstCombiner::BuilderTy &Builder) {2462auto *S0 = dyn_cast<Instruction>(Shuf.getOperand(0));2463Value *X;2464if (!S0 || !match(S0, m_CombineOr(m_FNeg(m_Value(X)), m_FAbs(m_Value(X)))))2465return nullptr;24662467bool IsFNeg = S0->getOpcode() == Instruction::FNeg;24682469// Match 1-input (unary) shuffle.2470// shuffle (fneg/fabs X), Mask --> fneg/fabs (shuffle X, Mask)2471if (S0->hasOneUse() && match(Shuf.getOperand(1), m_Poison())) {2472Value *NewShuf = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());2473if (IsFNeg)2474return UnaryOperator::CreateFNegFMF(NewShuf, S0);24752476Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(),2477Intrinsic::fabs, Shuf.getType());2478CallInst *NewF = CallInst::Create(FAbs, {NewShuf});2479NewF->setFastMathFlags(S0->getFastMathFlags());2480return NewF;2481}24822483// Match 2-input (binary) shuffle.2484auto *S1 = dyn_cast<Instruction>(Shuf.getOperand(1));2485Value *Y;2486if (!S1 || !match(S1, m_CombineOr(m_FNeg(m_Value(Y)), m_FAbs(m_Value(Y)))) ||2487S0->getOpcode() != S1->getOpcode() ||2488(!S0->hasOneUse() && !S1->hasOneUse()))2489return nullptr;24902491// shuf (fneg/fabs X), (fneg/fabs Y), Mask --> fneg/fabs (shuf X, Y, Mask)2492Value *NewShuf = Builder.CreateShuffleVector(X, Y, Shuf.getShuffleMask());2493Instruction *NewF;2494if (IsFNeg) {2495NewF = UnaryOperator::CreateFNeg(NewShuf);2496} else {2497Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(),2498Intrinsic::fabs, Shuf.getType());2499NewF = CallInst::Create(FAbs, {NewShuf});2500}2501NewF->copyIRFlags(S0);2502NewF->andIRFlags(S1);2503return NewF;2504}25052506/// Canonicalize casts after shuffle.2507static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,2508InstCombiner::BuilderTy &Builder) {2509// Do we have 2 matching cast operands?2510auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0));2511auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));2512if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||2513Cast0->getSrcTy() != Cast1->getSrcTy())2514return nullptr;25152516// TODO: Allow other opcodes? That would require easing the type restrictions2517// below here.2518CastInst::CastOps CastOpcode = Cast0->getOpcode();2519switch (CastOpcode) {2520case Instruction::FPToSI:2521case Instruction::FPToUI:2522case Instruction::SIToFP:2523case Instruction::UIToFP:2524break;2525default:2526return nullptr;2527}25282529VectorType *ShufTy = Shuf.getType();2530VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType());2531VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());25322533// TODO: Allow length-increasing shuffles?2534if (ShufTy->getElementCount().getKnownMinValue() >2535ShufOpTy->getElementCount().getKnownMinValue())2536return nullptr;25372538// TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)?2539assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) &&2540"Expected fixed vector operands for casts and binary shuffle");2541if (CastSrcTy->getPrimitiveSizeInBits() > ShufOpTy->getPrimitiveSizeInBits())2542return nullptr;25432544// At least one of the operands must have only one use (the shuffle).2545if (!Cast0->hasOneUse() && !Cast1->hasOneUse())2546return nullptr;25472548// shuffle (cast X), (cast Y), Mask --> cast (shuffle X, Y, Mask)2549Value *X = Cast0->getOperand(0);2550Value *Y = Cast1->getOperand(0);2551Value *NewShuf = Builder.CreateShuffleVector(X, Y, Shuf.getShuffleMask());2552return CastInst::Create(CastOpcode, NewShuf, ShufTy);2553}25542555/// Try to fold an extract subvector operation.2556static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {2557Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);2558if (!Shuf.isIdentityWithExtract() || !match(Op1, m_Poison()))2559return nullptr;25602561// Check if we are extracting all bits of an inserted scalar:2562// extract-subvec (bitcast (inselt ?, X, 0) --> bitcast X to subvec type2563Value *X;2564if (match(Op0, m_BitCast(m_InsertElt(m_Value(), m_Value(X), m_Zero()))) &&2565X->getType()->getPrimitiveSizeInBits() ==2566Shuf.getType()->getPrimitiveSizeInBits())2567return new BitCastInst(X, Shuf.getType());25682569// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.2570Value *Y;2571ArrayRef<int> Mask;2572if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))2573return nullptr;25742575// Be conservative with shuffle transforms. If we can't kill the 1st shuffle,2576// then combining may result in worse codegen.2577if (!Op0->hasOneUse())2578return nullptr;25792580// We are extracting a subvector from a shuffle. Remove excess elements from2581// the 1st shuffle mask to eliminate the extract.2582//2583// This transform is conservatively limited to identity extracts because we do2584// not allow arbitrary shuffle mask creation as a target-independent transform2585// (because we can't guarantee that will lower efficiently).2586//2587// If the extracting shuffle has an poison mask element, it transfers to the2588// new shuffle mask. Otherwise, copy the original mask element. Example:2589// shuf (shuf X, Y, <C0, C1, C2, poison, C4>), poison, <0, poison, 2, 3> -->2590// shuf X, Y, <C0, poison, C2, poison>2591unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();2592SmallVector<int, 16> NewMask(NumElts);2593assert(NumElts < Mask.size() &&2594"Identity with extract must have less elements than its inputs");25952596for (unsigned i = 0; i != NumElts; ++i) {2597int ExtractMaskElt = Shuf.getMaskValue(i);2598int MaskElt = Mask[i];2599NewMask[i] = ExtractMaskElt == PoisonMaskElem ? ExtractMaskElt : MaskElt;2600}2601return new ShuffleVectorInst(X, Y, NewMask);2602}26032604/// Try to replace a shuffle with an insertelement or try to replace a shuffle2605/// operand with the operand of an insertelement.2606static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,2607InstCombinerImpl &IC) {2608Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);2609SmallVector<int, 16> Mask;2610Shuf.getShuffleMask(Mask);26112612int NumElts = Mask.size();2613int InpNumElts = cast<FixedVectorType>(V0->getType())->getNumElements();26142615// This is a specialization of a fold in SimplifyDemandedVectorElts. We may2616// not be able to handle it there if the insertelement has >1 use.2617// If the shuffle has an insertelement operand but does not choose the2618// inserted scalar element from that value, then we can replace that shuffle2619// operand with the source vector of the insertelement.2620Value *X;2621uint64_t IdxC;2622if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {2623// shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask2624if (!is_contained(Mask, (int)IdxC))2625return IC.replaceOperand(Shuf, 0, X);2626}2627if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {2628// Offset the index constant by the vector width because we are checking for2629// accesses to the 2nd vector input of the shuffle.2630IdxC += InpNumElts;2631// shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask2632if (!is_contained(Mask, (int)IdxC))2633return IC.replaceOperand(Shuf, 1, X);2634}2635// For the rest of the transform, the shuffle must not change vector sizes.2636// TODO: This restriction could be removed if the insert has only one use2637// (because the transform would require a new length-changing shuffle).2638if (NumElts != InpNumElts)2639return nullptr;26402641// shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'2642auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {2643// We need an insertelement with a constant index.2644if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),2645m_ConstantInt(IndexC))))2646return false;26472648// Test the shuffle mask to see if it splices the inserted scalar into the2649// operand 1 vector of the shuffle.2650int NewInsIndex = -1;2651for (int i = 0; i != NumElts; ++i) {2652// Ignore undef mask elements.2653if (Mask[i] == -1)2654continue;26552656// The shuffle takes elements of operand 1 without lane changes.2657if (Mask[i] == NumElts + i)2658continue;26592660// The shuffle must choose the inserted scalar exactly once.2661if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())2662return false;26632664// The shuffle is placing the inserted scalar into element i.2665NewInsIndex = i;2666}26672668assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");26692670// Index is updated to the potentially translated insertion lane.2671IndexC = ConstantInt::get(IndexC->getIntegerType(), NewInsIndex);2672return true;2673};26742675// If the shuffle is unnecessary, insert the scalar operand directly into2676// operand 1 of the shuffle. Example:2677// shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 02678Value *Scalar;2679ConstantInt *IndexC;2680if (isShufflingScalarIntoOp1(Scalar, IndexC))2681return InsertElementInst::Create(V1, Scalar, IndexC);26822683// Try again after commuting shuffle. Example:2684// shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->2685// shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 32686std::swap(V0, V1);2687ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);2688if (isShufflingScalarIntoOp1(Scalar, IndexC))2689return InsertElementInst::Create(V1, Scalar, IndexC);26902691return nullptr;2692}26932694static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {2695// Match the operands as identity with padding (also known as concatenation2696// with undef) shuffles of the same source type. The backend is expected to2697// recreate these concatenations from a shuffle of narrow operands.2698auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));2699auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));2700if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||2701!Shuffle1 || !Shuffle1->isIdentityWithPadding())2702return nullptr;27032704// We limit this transform to power-of-2 types because we expect that the2705// backend can convert the simplified IR patterns to identical nodes as the2706// original IR.2707// TODO: If we can verify the same behavior for arbitrary types, the2708// power-of-2 checks can be removed.2709Value *X = Shuffle0->getOperand(0);2710Value *Y = Shuffle1->getOperand(0);2711if (X->getType() != Y->getType() ||2712!isPowerOf2_32(cast<FixedVectorType>(Shuf.getType())->getNumElements()) ||2713!isPowerOf2_32(2714cast<FixedVectorType>(Shuffle0->getType())->getNumElements()) ||2715!isPowerOf2_32(cast<FixedVectorType>(X->getType())->getNumElements()) ||2716match(X, m_Undef()) || match(Y, m_Undef()))2717return nullptr;2718assert(match(Shuffle0->getOperand(1), m_Undef()) &&2719match(Shuffle1->getOperand(1), m_Undef()) &&2720"Unexpected operand for identity shuffle");27212722// This is a shuffle of 2 widening shuffles. We can shuffle the narrow source2723// operands directly by adjusting the shuffle mask to account for the narrower2724// types:2725// shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'2726int NarrowElts = cast<FixedVectorType>(X->getType())->getNumElements();2727int WideElts = cast<FixedVectorType>(Shuffle0->getType())->getNumElements();2728assert(WideElts > NarrowElts && "Unexpected types for identity with padding");27292730ArrayRef<int> Mask = Shuf.getShuffleMask();2731SmallVector<int, 16> NewMask(Mask.size(), -1);2732for (int i = 0, e = Mask.size(); i != e; ++i) {2733if (Mask[i] == -1)2734continue;27352736// If this shuffle is choosing an undef element from 1 of the sources, that2737// element is undef.2738if (Mask[i] < WideElts) {2739if (Shuffle0->getMaskValue(Mask[i]) == -1)2740continue;2741} else {2742if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)2743continue;2744}27452746// If this shuffle is choosing from the 1st narrow op, the mask element is2747// the same. If this shuffle is choosing from the 2nd narrow op, the mask2748// element is offset down to adjust for the narrow vector widths.2749if (Mask[i] < WideElts) {2750assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");2751NewMask[i] = Mask[i];2752} else {2753assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");2754NewMask[i] = Mask[i] - (WideElts - NarrowElts);2755}2756}2757return new ShuffleVectorInst(X, Y, NewMask);2758}27592760// Splatting the first element of the result of a BinOp, where any of the2761// BinOp's operands are the result of a first element splat can be simplified to2762// splatting the first element of the result of the BinOp2763Instruction *InstCombinerImpl::simplifyBinOpSplats(ShuffleVectorInst &SVI) {2764if (!match(SVI.getOperand(1), m_Poison()) ||2765!match(SVI.getShuffleMask(), m_ZeroMask()) ||2766!SVI.getOperand(0)->hasOneUse())2767return nullptr;27682769Value *Op0 = SVI.getOperand(0);2770Value *X, *Y;2771if (!match(Op0, m_BinOp(m_Shuffle(m_Value(X), m_Poison(), m_ZeroMask()),2772m_Value(Y))) &&2773!match(Op0, m_BinOp(m_Value(X),2774m_Shuffle(m_Value(Y), m_Poison(), m_ZeroMask()))))2775return nullptr;2776if (X->getType() != Y->getType())2777return nullptr;27782779auto *BinOp = cast<BinaryOperator>(Op0);2780if (!isSafeToSpeculativelyExecute(BinOp))2781return nullptr;27822783Value *NewBO = Builder.CreateBinOp(BinOp->getOpcode(), X, Y);2784if (auto NewBOI = dyn_cast<Instruction>(NewBO))2785NewBOI->copyIRFlags(BinOp);27862787return new ShuffleVectorInst(NewBO, SVI.getShuffleMask());2788}27892790Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {2791Value *LHS = SVI.getOperand(0);2792Value *RHS = SVI.getOperand(1);2793SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);2794if (auto *V = simplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),2795SVI.getType(), ShufQuery))2796return replaceInstUsesWith(SVI, V);27972798if (Instruction *I = simplifyBinOpSplats(SVI))2799return I;28002801// Canonicalize splat shuffle to use poison RHS. Handle this explicitly in2802// order to support scalable vectors.2803if (match(SVI.getShuffleMask(), m_ZeroMask()) && !isa<PoisonValue>(RHS))2804return replaceOperand(SVI, 1, PoisonValue::get(RHS->getType()));28052806if (isa<ScalableVectorType>(LHS->getType()))2807return nullptr;28082809unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements();2810unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements();28112812// shuffle (bitcast X), (bitcast Y), Mask --> bitcast (shuffle X, Y, Mask)2813//2814// if X and Y are of the same (vector) type, and the element size is not2815// changed by the bitcasts, we can distribute the bitcasts through the2816// shuffle, hopefully reducing the number of instructions. We make sure that2817// at least one bitcast only has one use, so we don't *increase* the number of2818// instructions here.2819Value *X, *Y;2820if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_BitCast(m_Value(Y))) &&2821X->getType()->isVectorTy() && X->getType() == Y->getType() &&2822X->getType()->getScalarSizeInBits() ==2823SVI.getType()->getScalarSizeInBits() &&2824(LHS->hasOneUse() || RHS->hasOneUse())) {2825Value *V = Builder.CreateShuffleVector(X, Y, SVI.getShuffleMask(),2826SVI.getName() + ".uncasted");2827return new BitCastInst(V, SVI.getType());2828}28292830ArrayRef<int> Mask = SVI.getShuffleMask();28312832// Peek through a bitcasted shuffle operand by scaling the mask. If the2833// simulated shuffle can simplify, then this shuffle is unnecessary:2834// shuf (bitcast X), undef, Mask --> bitcast X'2835// TODO: This could be extended to allow length-changing shuffles.2836// The transform might also be obsoleted if we allowed canonicalization2837// of bitcasted shuffles.2838if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&2839X->getType()->isVectorTy() && VWidth == LHSWidth) {2840// Try to create a scaled mask constant.2841auto *XType = cast<FixedVectorType>(X->getType());2842unsigned XNumElts = XType->getNumElements();2843SmallVector<int, 16> ScaledMask;2844if (scaleShuffleMaskElts(XNumElts, Mask, ScaledMask)) {2845// If the shuffled source vector simplifies, cast that value to this2846// shuffle's type.2847if (auto *V = simplifyShuffleVectorInst(X, UndefValue::get(XType),2848ScaledMask, XType, ShufQuery))2849return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());2850}2851}28522853// shuffle x, x, mask --> shuffle x, undef, mask'2854if (LHS == RHS) {2855assert(!match(RHS, m_Undef()) &&2856"Shuffle with 2 undef ops not simplified?");2857return new ShuffleVectorInst(LHS, createUnaryMask(Mask, LHSWidth));2858}28592860// shuffle undef, x, mask --> shuffle x, undef, mask'2861if (match(LHS, m_Undef())) {2862SVI.commute();2863return &SVI;2864}28652866if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))2867return I;28682869if (Instruction *I = foldSelectShuffle(SVI))2870return I;28712872if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))2873return I;28742875if (Instruction *I = narrowVectorSelect(SVI, Builder))2876return I;28772878if (Instruction *I = foldShuffleOfUnaryOps(SVI, Builder))2879return I;28802881if (Instruction *I = foldCastShuffle(SVI, Builder))2882return I;28832884APInt PoisonElts(VWidth, 0);2885APInt AllOnesEltMask(APInt::getAllOnes(VWidth));2886if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, PoisonElts)) {2887if (V != &SVI)2888return replaceInstUsesWith(SVI, V);2889return &SVI;2890}28912892if (Instruction *I = foldIdentityExtractShuffle(SVI))2893return I;28942895// These transforms have the potential to lose undef knowledge, so they are2896// intentionally placed after SimplifyDemandedVectorElts().2897if (Instruction *I = foldShuffleWithInsert(SVI, *this))2898return I;2899if (Instruction *I = foldIdentityPaddedShuffles(SVI))2900return I;29012902if (match(RHS, m_Poison()) && canEvaluateShuffled(LHS, Mask)) {2903Value *V = evaluateInDifferentElementOrder(LHS, Mask, Builder);2904return replaceInstUsesWith(SVI, V);2905}29062907// SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to2908// a non-vector type. We can instead bitcast the original vector followed by2909// an extract of the desired element:2910//2911// %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,2912// <4 x i32> <i32 0, i32 1, i32 2, i32 3>2913// %1 = bitcast <4 x i8> %sroa to i322914// Becomes:2915// %bc = bitcast <16 x i8> %in to <4 x i32>2916// %ext = extractelement <4 x i32> %bc, i32 02917//2918// If the shuffle is extracting a contiguous range of values from the input2919// vector then each use which is a bitcast of the extracted size can be2920// replaced. This will work if the vector types are compatible, and the begin2921// index is aligned to a value in the casted vector type. If the begin index2922// isn't aligned then we can shuffle the original vector (keeping the same2923// vector type) before extracting.2924//2925// This code will bail out if the target type is fundamentally incompatible2926// with vectors of the source type.2927//2928// Example of <16 x i8>, target type i32:2929// Index range [4,8): v-----------v Will work.2930// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+2931// <16 x i8>: | | | | | | | | | | | | | | | | |2932// <4 x i32>: | | | | |2933// +-----------+-----------+-----------+-----------+2934// Index range [6,10): ^-----------^ Needs an extra shuffle.2935// Target type i40: ^--------------^ Won't work, bail.2936bool MadeChange = false;2937if (isShuffleExtractingFromLHS(SVI, Mask)) {2938Value *V = LHS;2939unsigned MaskElems = Mask.size();2940auto *SrcTy = cast<FixedVectorType>(V->getType());2941unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedValue();2942unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());2943assert(SrcElemBitWidth && "vector elements must have a bitwidth");2944unsigned SrcNumElems = SrcTy->getNumElements();2945SmallVector<BitCastInst *, 8> BCs;2946DenseMap<Type *, Value *> NewBCs;2947for (User *U : SVI.users())2948if (BitCastInst *BC = dyn_cast<BitCastInst>(U))2949if (!BC->use_empty())2950// Only visit bitcasts that weren't previously handled.2951BCs.push_back(BC);2952for (BitCastInst *BC : BCs) {2953unsigned BegIdx = Mask.front();2954Type *TgtTy = BC->getDestTy();2955unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);2956if (!TgtElemBitWidth)2957continue;2958unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;2959bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;2960bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);2961if (!VecBitWidthsEqual)2962continue;2963if (!VectorType::isValidElementType(TgtTy))2964continue;2965auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);2966if (!BegIsAligned) {2967// Shuffle the input so [0,NumElements) contains the output, and2968// [NumElems,SrcNumElems) is undef.2969SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);2970for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)2971ShuffleMask[I] = Idx;2972V = Builder.CreateShuffleVector(V, ShuffleMask,2973SVI.getName() + ".extract");2974BegIdx = 0;2975}2976unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;2977assert(SrcElemsPerTgtElem);2978BegIdx /= SrcElemsPerTgtElem;2979bool BCAlreadyExists = NewBCs.contains(CastSrcTy);2980auto *NewBC =2981BCAlreadyExists2982? NewBCs[CastSrcTy]2983: Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");2984if (!BCAlreadyExists)2985NewBCs[CastSrcTy] = NewBC;2986auto *Ext = Builder.CreateExtractElement(NewBC, BegIdx,2987SVI.getName() + ".extract");2988// The shufflevector isn't being replaced: the bitcast that used it2989// is. InstCombine will visit the newly-created instructions.2990replaceInstUsesWith(*BC, Ext);2991MadeChange = true;2992}2993}29942995// If the LHS is a shufflevector itself, see if we can combine it with this2996// one without producing an unusual shuffle.2997// Cases that might be simplified:2998// 1.2999// x1=shuffle(v1,v2,mask1)3000// x=shuffle(x1,undef,mask)3001// ==>3002// x=shuffle(v1,undef,newMask)3003// newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -13004// 2.3005// x1=shuffle(v1,undef,mask1)3006// x=shuffle(x1,x2,mask)3007// where v1.size() == mask1.size()3008// ==>3009// x=shuffle(v1,x2,newMask)3010// newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]3011// 3.3012// x2=shuffle(v2,undef,mask2)3013// x=shuffle(x1,x2,mask)3014// where v2.size() == mask2.size()3015// ==>3016// x=shuffle(x1,v2,newMask)3017// newMask[i] = (mask[i] < x1.size())3018// ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()3019// 4.3020// x1=shuffle(v1,undef,mask1)3021// x2=shuffle(v2,undef,mask2)3022// x=shuffle(x1,x2,mask)3023// where v1.size() == v2.size()3024// ==>3025// x=shuffle(v1,v2,newMask)3026// newMask[i] = (mask[i] < x1.size())3027// ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()3028//3029// Here we are really conservative:3030// we are absolutely afraid of producing a shuffle mask not in the input3031// program, because the code gen may not be smart enough to turn a merged3032// shuffle into two specific shuffles: it may produce worse code. As such,3033// we only merge two shuffles if the result is either a splat or one of the3034// input shuffle masks. In this case, merging the shuffles just removes3035// one instruction, which we know is safe. This is good for things like3036// turning: (splat(splat)) -> splat, or3037// merge(V[0..n], V[n+1..2n]) -> V[0..2n]3038ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);3039ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);3040if (LHSShuffle)3041if (!match(LHSShuffle->getOperand(1), m_Poison()) &&3042!match(RHS, m_Poison()))3043LHSShuffle = nullptr;3044if (RHSShuffle)3045if (!match(RHSShuffle->getOperand(1), m_Poison()))3046RHSShuffle = nullptr;3047if (!LHSShuffle && !RHSShuffle)3048return MadeChange ? &SVI : nullptr;30493050Value* LHSOp0 = nullptr;3051Value* LHSOp1 = nullptr;3052Value* RHSOp0 = nullptr;3053unsigned LHSOp0Width = 0;3054unsigned RHSOp0Width = 0;3055if (LHSShuffle) {3056LHSOp0 = LHSShuffle->getOperand(0);3057LHSOp1 = LHSShuffle->getOperand(1);3058LHSOp0Width = cast<FixedVectorType>(LHSOp0->getType())->getNumElements();3059}3060if (RHSShuffle) {3061RHSOp0 = RHSShuffle->getOperand(0);3062RHSOp0Width = cast<FixedVectorType>(RHSOp0->getType())->getNumElements();3063}3064Value* newLHS = LHS;3065Value* newRHS = RHS;3066if (LHSShuffle) {3067// case 13068if (match(RHS, m_Poison())) {3069newLHS = LHSOp0;3070newRHS = LHSOp1;3071}3072// case 2 or 43073else if (LHSOp0Width == LHSWidth) {3074newLHS = LHSOp0;3075}3076}3077// case 3 or 43078if (RHSShuffle && RHSOp0Width == LHSWidth) {3079newRHS = RHSOp0;3080}3081// case 43082if (LHSOp0 == RHSOp0) {3083newLHS = LHSOp0;3084newRHS = nullptr;3085}30863087if (newLHS == LHS && newRHS == RHS)3088return MadeChange ? &SVI : nullptr;30893090ArrayRef<int> LHSMask;3091ArrayRef<int> RHSMask;3092if (newLHS != LHS)3093LHSMask = LHSShuffle->getShuffleMask();3094if (RHSShuffle && newRHS != RHS)3095RHSMask = RHSShuffle->getShuffleMask();30963097unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;3098SmallVector<int, 16> newMask;3099bool isSplat = true;3100int SplatElt = -1;3101// Create a new mask for the new ShuffleVectorInst so that the new3102// ShuffleVectorInst is equivalent to the original one.3103for (unsigned i = 0; i < VWidth; ++i) {3104int eltMask;3105if (Mask[i] < 0) {3106// This element is a poison value.3107eltMask = -1;3108} else if (Mask[i] < (int)LHSWidth) {3109// This element is from left hand side vector operand.3110//3111// If LHS is going to be replaced (case 1, 2, or 4), calculate the3112// new mask value for the element.3113if (newLHS != LHS) {3114eltMask = LHSMask[Mask[i]];3115// If the value selected is an poison value, explicitly specify it3116// with a -1 mask value.3117if (eltMask >= (int)LHSOp0Width && isa<PoisonValue>(LHSOp1))3118eltMask = -1;3119} else3120eltMask = Mask[i];3121} else {3122// This element is from right hand side vector operand3123//3124// If the value selected is a poison value, explicitly specify it3125// with a -1 mask value. (case 1)3126if (match(RHS, m_Poison()))3127eltMask = -1;3128// If RHS is going to be replaced (case 3 or 4), calculate the3129// new mask value for the element.3130else if (newRHS != RHS) {3131eltMask = RHSMask[Mask[i]-LHSWidth];3132// If the value selected is an poison value, explicitly specify it3133// with a -1 mask value.3134if (eltMask >= (int)RHSOp0Width) {3135assert(match(RHSShuffle->getOperand(1), m_Poison()) &&3136"should have been check above");3137eltMask = -1;3138}3139} else3140eltMask = Mask[i]-LHSWidth;31413142// If LHS's width is changed, shift the mask value accordingly.3143// If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any3144// references from RHSOp0 to LHSOp0, so we don't need to shift the mask.3145// If newRHS == newLHS, we want to remap any references from newRHS to3146// newLHS so that we can properly identify splats that may occur due to3147// obfuscation across the two vectors.3148if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)3149eltMask += newLHSWidth;3150}31513152// Check if this could still be a splat.3153if (eltMask >= 0) {3154if (SplatElt >= 0 && SplatElt != eltMask)3155isSplat = false;3156SplatElt = eltMask;3157}31583159newMask.push_back(eltMask);3160}31613162// If the result mask is equal to one of the original shuffle masks,3163// or is a splat, do the replacement.3164if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {3165if (!newRHS)3166newRHS = PoisonValue::get(newLHS->getType());3167return new ShuffleVectorInst(newLHS, newRHS, newMask);3168}31693170return MadeChange ? &SVI : nullptr;3171}317231733174