Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}
extern cl::opt<unsigned> ForceTargetInstructionCost;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}
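// Editorial note (not in the original source): these queries are deliberately
// conservative. Any recipe kind not explicitly listed falls through to the
// `default` case and is treated as if it may access memory.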
bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::ExtractFromEnd:
    case VPInstruction::FirstOrderRecurrenceSplice:
    case VPInstruction::LogicalAnd:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC: {
    Function *Fn = cast<VPWidenCallRecipe>(this)->getCalledScalarFunction();
    return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
  }
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  VPValue *ExitValue = getOperand(0);
  auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                  ? VPLane::getFirstLane()
                  : VPLane::getLastLaneForVF(State.VF);
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
  auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
  auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
                       ? MiddleVPBB
                       : ExitingVPBB;
  BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
  // Set insertion point in PredBB in case an extract needs to be generated.
  // TODO: Model extracts explicitly.
  State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
  Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
  if (Phi->getBasicBlockIndex(PredBB) != -1)
    Phi->setIncomingValueForBlock(PredBB, V);
  else
    Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

/// Return the underlying instruction to be used for computing \p R's cost via
/// the legacy cost model. Return nullptr if there's no suitable instruction.
static Instruction *getInstructionForCost(const VPRecipeBase *R) {
  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
    return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
  if (auto *IG = dyn_cast<VPInterleaveRecipe>(R))
    return IG->getInsertPos();
  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
    return &WidenMem->getIngredient();
  return nullptr;
}

InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  if (auto *UI = getInstructionForCost(this))
    if (Ctx.skipCostComputation(UI, VF.isVector()))
      return 0;

  InstructionCost RecipeCost = computeCost(VF, Ctx);
  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
      RecipeCost.isValid())
    RecipeCost = InstructionCost(ForceTargetInstructionCost);

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}
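// Editorial note (not in the original source): ForceTargetInstructionCost is
// the -force-target-instruction-cost command-line flag registered in
// LoopVectorize.cpp; e.g. -force-target-instruction-cost=1 assigns every
// recipe with a valid cost a uniform cost of 1, which helps when bisecting
// cost-model decisions.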
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  // Compute the cost for the recipe falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // returns 0.
  Instruction *UI = getInstructionForCost(this);
  if (UI && isa<VPReplicateRecipe>(this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing their cost multiple times for now.
    Ctx.SkipCostComputation.insert(UI);
  }
  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::ICmp:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop trip count.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                               ScalarTC, Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3];
    //     v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(),
          Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop; vsetvli
    // can be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
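    // Editorial note (not in the original source): the intrinsic returns a
    // per-iteration element count bounded by both AVL (the remaining trip
    // count) and the hardware vector length; on RISC-V it corresponds to the
    // value a vsetvli would produce.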
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of
    // SIMD elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now
    // and to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();

    if (!getParent()->isExiting())
      return CondBr;

    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
    CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
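    // Editorial note (not in the original source): e.g. with UF = 2 an
    // unordered add reduction combines the two part vectors with a single
    // vector add in the loop below; only the final horizontal reduction
    // further down collapses the result to a scalar.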
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
      Op = Instruction::Or;

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if ((State.VF.isVector() ||
         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
        !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to
      // extend the reduction to the wider type before we branch to the
      // original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::ExtractFromEnd: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
    unsigned Offset = CI->getZExtValue();
    assert(Offset > 0 && "Offset from end must be positive");
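    // Editorial note (not in the original source): Offset counts backwards
    // from the end of the vector iteration space; Offset = 1 selects the last
    // lane of the final unrolled part, Offset = 2 the one before it, etc.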
    Value *Res;
    if (State.VF.isVector()) {
      assert(Offset <= State.VF.getKnownMinValue() &&
             "invalid offset to extract from");
      // Extract lane VF - Offset from the operand.
      Res = State.get(
          getOperand(0),
          VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
    } else {
      assert(Offset <= State.UF && "invalid offset to extract from");
      // When loop is unrolled without vectorizing, retrieve UF - Offset.
      Res = State.get(getOperand(0), State.UF - Offset);
    }
    if (isa<ExtractElementInst>(Res))
      Res->setName(Name);
    return Res;
  }
  case VPInstruction::LogicalAnd: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateLogicalAnd(A, B, Name);
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  case VPInstruction::ResumePhi: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);
    Value *IncomingFromVPlanPred =
        State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *IncomingFromOtherPreds =
        State.get(getOperand(1), Part, /* IsScalar */ true);
    auto *NewPhi =
        Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
    BasicBlock *VPlanPred =
        State.CFG
            .VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
    NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
    for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
      assert(OtherPred != VPlanPred &&
             "VPlan predecessors should not be connected yet");
      NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
    }
    return NewPhi;
  }

  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

bool VPInstruction::isVectorToScalar() const {
  return getOpcode() == VPInstruction::ExtractFromEnd ||
         getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
  return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we
  // don't support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(this) ||
                                    isVectorToScalar() || isSingleScalar());
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }
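    // Editorial note (not in the original source): when only the first part
    // is used, later parts simply re-register part 0's value instead of
    // emitting redundant IR.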
    if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
      Value *Part0 = State.get(this, 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
      State.set(this, Part0, Part,
                /*IsScalar*/ GeneratesPerFirstLaneOnly);
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::ResumePhi:
    return true;
  };
  llvm_unreachable("switch should return");
}

bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstPartUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case Instruction::Select:
    return vputils::onlyFirstPartUsed(this);
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::CanonicalIVIncrementForPart:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ResumePhi:
    O << "resume-phi";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ExtractFromEnd:
    O << "extract-from-end";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  Function *CalledScalarFn = getCalledScalarFunction();
  assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(VectorType::get(
          CalledScalarFn->getReturnType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(arg_operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
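      // Editorial note (not in the original source): e.g. the i32 exponent
      // of llvm.powi is such a scalar operand, so lane 0 of part 0 is reused
      // for every unrolled part.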
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar
      // value from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    auto *CI = cast_or_null<CallInst>(getUnderlyingInstr());
    SmallVector<OperandBundleDef, 1> OpBundles;
    if (CI)
      CI->getOperandBundlesAsDefs(OpBundles);

    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, CI);
  }
}
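// Editorial note (not in the original source): `Variant`, when set, is a
// vector-function variant resolved via the vector-function ABI (for example
// a vector math-library routine or an OpenMP `declare simd` clone); it is
// used when no suitable vector intrinsic exists.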
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CalledFn->getName() << "(";
  interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
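/// Editorial example (not in the original source): for a splat Val of v,
/// StartIdx = 0 and Step = 2, the result is <v, v + 2, v + 4, ...>.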
/// \p Opcode is relevant for FP induction variable.
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new
  // induction variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step
  // value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }
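  // Editorial note (not in the original source): e.g. for a fixed VF of 4,
  // UF of 2 and unit step, part 0 holds <i, i+1, i+2, i+3> and part 1 the
  // step.add value <i+4, ..., i+7>; the final step.add becomes the phi's
  // back-edge value below.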
  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  auto *CanIV = cast<VPCanonicalIVPHIRecipe>(&*getParent()->begin());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == CanIV->getScalarType();
}
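// Editorial note (not in the original source): the canonical IV is the
// zero-based, unit-step integer induction that drives the vector loop; other
// inductions can be recomputed from it as Start + CanonicalIV * Step, which
// is what the DERIVED-IV recipe printed below expresses.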
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << Indent << "= DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum
      // number of elements so we do those below. This improves the code
      // quality when trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
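    // Editorial note (not in the original source): each lane's value is
    // BaseIV + (Part * VF + Lane) * Step; e.g. Part = 1, VF = 4, Lane = 2
    // with a unit step yields BaseIV + 6.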
    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (areAllOperandsInvariant()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));

    auto *NewGEP =
        State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
                                ArrayRef(Ops).drop_front(), "", isInBounds());
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
      State.set(this, EntryPart, Part);
      State.addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = isPointerLoopInvariant()
                      ? State.get(getOperand(0), VPIteration(0, 0))
                      : State.get(getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
        VPValue *Operand = getOperand(I);
        if (isIndexLoopInvariant(I - 1))
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
                                             Indices, "", isInBounds());
      assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(this, NewGEP, Part);
      State.addMetadata(NewGEP, GEP);
    }
  }
}
"Inv" : "Var") << "]";15761577O << " ";1578printAsOperand(O, SlotTracker);1579O << " = getelementptr";1580printFlags(O);1581printOperands(O, SlotTracker);1582}1583#endif15841585void VPVectorPointerRecipe ::execute(VPTransformState &State) {1586auto &Builder = State.Builder;1587State.setDebugLocFrom(getDebugLoc());1588for (unsigned Part = 0; Part < State.UF; ++Part) {1589// Calculate the pointer for the specific unroll-part.1590Value *PartPtr = nullptr;1591// Use i32 for the gep index type when the value is constant,1592// or query DataLayout for a more suitable index type otherwise.1593const DataLayout &DL =1594Builder.GetInsertBlock()->getDataLayout();1595Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)1596? DL.getIndexType(IndexedTy->getPointerTo())1597: Builder.getInt32Ty();1598Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));1599bool InBounds = isInBounds();1600if (IsReverse) {1601// If the address is consecutive but reversed, then the1602// wide store needs to start at the last vector element.1603// RunTimeVF = VScale * VF.getKnownMinValue()1604// For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()1605Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);1606// NumElt = -Part * RunTimeVF1607Value *NumElt = Builder.CreateMul(1608ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);1609// LastLane = 1 - RunTimeVF1610Value *LastLane =1611Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);1612PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);1613PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);1614} else {1615Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);1616PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);1617}16181619State.set(this, PartPtr, Part, /*IsScalar*/ true);1620}1621}16221623#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)1624void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,1625VPSlotTracker &SlotTracker) const {1626O << Indent;1627printAsOperand(O, SlotTracker);1628O << " = vector-pointer ";1629if (IsReverse)1630O << "(reverse) ";16311632printOperands(O, SlotTracker);1633}1634#endif16351636void VPBlendRecipe::execute(VPTransformState &State) {1637State.setDebugLocFrom(getDebugLoc());1638// We know that all PHIs in non-header blocks are converted into1639// selects, so we don't have to worry about the insertion order and we1640// can just use the builder.1641// At this point we generate the predication tree. 
void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder. At this point we generate the predication
  // tree. There may be duplications since this is a simple recursive scan,
  // but future optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
  VectorParts Entry(State.UF);
  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
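      // Masked-off lanes must not change the reduced value: blend in the
      // reduction's identity (e.g. 0 for an add reduction, 1 for a mul
      // reduction) wherever the condition is false.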
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  assert(State.UF == 1 &&
         "Expected only UF == 1 when vectorizing with explicit vector length.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
  Builder.setFastMathFlags(RdxDesc.getFastMathFlags());

  RecurKind Kind = RdxDesc.getRecurrenceKind();
  Value *Prev = State.get(getChainOp(), 0, /*IsScalar*/ true);
  Value *VecOp = State.get(getVecOp(), 0);
  Value *EVL = State.get(getEVL(), VPIteration(0, 0));

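  // Lanes past EVL must not take part in the reduction. Routing the calls
  // below through a VectorBuilder emits vector-predicated intrinsics that
  // take the mask and the explicit vector length as extra operands.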
  VectorBuilder VBuilder(Builder);
  VBuilder.setEVL(EVL);
  Value *Mask;
  // TODO: move the all-true mask generation into VectorBuilder.
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(CondOp, 0);
  else
    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
  VBuilder.setMask(Mask);

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(VBuilder, RdxDesc, VecOp, Prev);
  } else {
    NewRed = createSimpleTargetReduction(VBuilder, VecOp, RdxDesc);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
    else
      NewRed = Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, Prev);
  }
  State.set(this, NewRed, 0, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}

void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
/// if it is either defined outside the vector region or its operand is known
/// to be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
/// TODO: Uniformity should be associated with a VPValue and there should be a
/// generic way to check.
static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
  return C->isDefinedOutsideVectorRegions() ||
         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
}

Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
  assert(vputils::onlyFirstLaneUsed(this) &&
         "Codegen only implemented for first lane.");
  switch (Opcode) {
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::Trunc: {
    // Note: SExt/ZExt not used yet.
    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

void VPScalarCastRecipe::execute(VPTransformState &State) {
  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
  for (unsigned Part = 0; Part != State.UF; ++Part) {
    Value *Res;
    // Only generate a single instance if the recipe is uniform across UFs
    // and VFs.
    if (Part > 0 && IsUniformAcrossVFsAndUFs)
      Res = State.get(this, VPIteration(0, 0));
    else
      Res = generate(State, Part);
    State.set(this, Res, VPIteration(Part, 0));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *ResultTy;
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

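  // The recipe runs once per replicated lane: extract this lane's bit from
  // the (possibly vector) block mask to drive the scalar conditional branch.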
  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

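/// Cast the vector \p V element-wise to \p DstVTy. When no direct bit- or
/// pointer-cast between the element types exists (pointer vs. floating
/// point), go through an intermediate integer vector.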
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
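/// E.g. interleaving <a0, a1> and <b0, b1> produces <a0, b0, a1, b1>.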
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
                                   Vals,
                                   /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];   // Member of index 0
//     G = Pic[i+1]; // Member of index 1
//     B = Pic[i+2]; // Member of index 2
//     ...           // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>  ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R; // Member of index 0
//     Pic[i+1] = G; // Member of index 1
//     Pic[i+2] = B; // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec           ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  const InterleaveGroup<Instruction> *Group = IG;
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  VPValue *BlockInMask = getMask();
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  Value *Idx;
  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
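  // The net adjustment for a reversed group below is
  // Idx = -((RuntimeVF - 1) * Factor + Index); e.g. with VF = 4, factor 3 and
  // member index 0, the wide access starts 9 elements before the current
  // pointer.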
  if (Group->isReverse()) {
    Value *RuntimeVF =
        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
    Idx = State.Builder.CreateMul(Idx,
                                  State.Builder.getInt32(Group->getFactor()));
    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
    Idx = State.Builder.CreateNeg(Idx);
  } else
    Idx = State.Builder.getInt32(-Index);

  VPValue *Addr = getAddr();
  for (unsigned Part = 0; Part < State.UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    if (auto *I = dyn_cast<Instruction>(AddrPart))
      State.setDebugLocFrom(I->getDebugLoc());

    // Note that the current instruction could be at any member index, so the
    // address needs to be adjusted to the member of index 0.
    //
    // E.g. a = A[i+1];     // Member of index 1 (Current instruction)
    //      b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g. A[i+1] = a;     // Member of index 1
    //      A[i]   = b;     // Member of index 0
    //      A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = State.Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
    AddrParts.push_back(AddrPart);
  }

  State.setDebugLocFrom(Instr->getDebugLoc());
  Value *PoisonVec = PoisonValue::get(VecTy);

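  // Build the mask for the whole interleaved access by replicating each lane
  // of the block mask Factor times; e.g. for factor 3 and VF = 4,
  // <m0, m1, m2, m3> becomes <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>.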
  auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
                             unsigned Part, Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *BlockInMaskPart = State.get(BlockInMask, Part);
      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
                                     State.VF.getKnownMinValue() * 2, true);
      return State.Builder.CreateIntrinsic(
          MaskTy, Intrinsic::vector_interleave2, Ops,
          /*FMFSource=*/nullptr, "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *BlockInMaskPart = State.get(BlockInMask, Part);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        BlockInMaskPart,
        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
        "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                   ShuffledMask, MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps = createBitMaskForGaps(State.Builder,
                                         State.VF.getKnownMinValue(), *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < State.UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
        NewLoad = State.Builder.CreateMaskedLoad(VecTy, AddrParts[Part],
                                                 Group->getAlign(), GroupMask,
                                                 PoisonVec, "wide.masked.vec");
      } else
        NewLoad = State.Builder.CreateAlignedLoad(
            VecTy, AddrParts[Part], Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    ArrayRef<VPValue *> VPDefs = definedValues();
    const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
    if (VecTy->isScalableTy()) {
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");

      for (unsigned Part = 0; Part < State.UF; ++Part) {
        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
        // so must use intrinsics to deinterleave.
        Value *DI = State.Builder.CreateIntrinsic(
            Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
            /*FMFSource=*/nullptr, "strided.vec");
        unsigned J = 0;
        for (unsigned I = 0; I < InterleaveFactor; ++I) {
          Instruction *Member = Group->getMember(I);

          if (!Member)
            continue;

          Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
          // If this member has a different type, cast the result.
          if (Member->getType() != ScalarTy) {
            VectorType *OtherVTy =
                VectorType::get(Member->getType(), State.VF);
            StridedVec =
                createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
          }

          if (Group->isReverse())
            StridedVec =
                State.Builder.CreateVectorReverse(StridedVec, "reverse");

          State.set(VPDefs[J], StridedVec, Part);
          ++J;
        }
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
      for (unsigned Part = 0; Part < State.UF; Part++) {
        Value *StridedVec = State.Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has a different type, cast the result.
        if (Member->getType() != ScalarTy) {
          assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
          StridedVec =
              createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = State.Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for the current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || !State.VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  for (unsigned Part = 0; Part < State.UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
      ++StoredIdx;

      if (Group->isReverse())
        StoredVec = State.Builder.CreateVectorReverse(StoredVec, "reverse");

      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Interleave all the smaller vectors into one wider vector.
    Value *IVec =
        interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
      NewStoreInstr = State.Builder.CreateMaskedStore(
          IVec, AddrParts[Part], Group->getAlign(), GroupMask);
    } else
      NewStoreInstr = State.Builder.CreateAlignedStore(IVec, AddrParts[Part],
                                                       Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}

IG->getFactor() << " at ";2379IG->getInsertPos()->printAsOperand(O, false);2380O << ", ";2381getAddr()->printAsOperand(O, SlotTracker);2382VPValue *Mask = getMask();2383if (Mask) {2384O << ", ";2385Mask->printAsOperand(O, SlotTracker);2386}23872388unsigned OpIdx = 0;2389for (unsigned i = 0; i < IG->getFactor(); ++i) {2390if (!IG->getMember(i))2391continue;2392if (getNumStoreOperands() > 0) {2393O << "\n" << Indent << " store ";2394getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);2395O << " to index " << i;2396} else {2397O << "\n" << Indent << " ";2398getVPValue(OpIdx)->printAsOperand(O, SlotTracker);2399O << " = load from index " << i;2400}2401++OpIdx;2402}2403}2404#endif24052406void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {2407Value *Start = getStartValue()->getLiveInIRValue();2408PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");2409EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());24102411BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);2412EntryPart->addIncoming(Start, VectorPH);2413EntryPart->setDebugLoc(getDebugLoc());2414for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)2415State.set(this, EntryPart, Part, /*IsScalar*/ true);2416}24172418#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2419void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,2420VPSlotTracker &SlotTracker) const {2421O << Indent << "EMIT ";2422printAsOperand(O, SlotTracker);2423O << " = CANONICAL-INDUCTION ";2424printOperands(O, SlotTracker);2425}2426#endif24272428bool VPCanonicalIVPHIRecipe::isCanonical(2429InductionDescriptor::InductionKind Kind, VPValue *Start,2430VPValue *Step) const {2431// Must be an integer induction.2432if (Kind != InductionDescriptor::IK_IntInduction)2433return false;2434// Start must match the start value of this canonical induction.2435if (Start != getStartValue())2436return false;24372438// If the step is defined by a recipe, it is not a ConstantInt.2439if (Step->getDefiningRecipe())2440return false;24412442ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());2443return StepC && StepC->isOne();2444}24452446bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {2447return IsScalarAfterVectorization &&2448(!IsScalable || vputils::onlyFirstLaneUsed(this));2449}24502451#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2452void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,2453VPSlotTracker &SlotTracker) const {2454O << Indent << "EMIT ";2455printAsOperand(O, SlotTracker);2456O << " = WIDEN-POINTER-INDUCTION ";2457getStartValue()->printAsOperand(O, SlotTracker);2458O << ", " << *IndDesc.getStep();2459}2460#endif24612462void VPExpandSCEVRecipe::execute(VPTransformState &State) {2463assert(!State.Instance && "cannot be used in per-lane");2464const DataLayout &DL = State.CFG.PrevBB->getDataLayout();2465SCEVExpander Exp(SE, DL, "induction");24662467Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),2468&*State.Builder.GetInsertPoint());2469assert(!State.ExpandedSCEVs.contains(Expr) &&2470"Same SCEV expanded multiple times");2471State.ExpandedSCEVs[Expr] = Res;2472for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)2473State.set(this, Res, {Part, 0});2474}24752476#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)2477void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,2478VPSlotTracker &SlotTracker) const {2479O << Indent << "EMIT ";2480getVPSingleValue()->printAsOperand(O, SlotTracker);2481O << " = EXPAND SCEV " << 
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
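    // Only the last lane of the initial value is used by the recurrence;
    // e.g. for VF = 4 the vector built here is <poison, poison, poison, init>.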
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

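  // Only part 0 carries the start value; e.g. an unordered add reduction with
  // start value %s and VF = 4 begins at <%s, 0, 0, 0>, while every other part
  // begins at the identity splat <0, 0, 0, 0>.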
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif