Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
213799 views
//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7///8/// \file9/// This file implements explicit unrolling for VPlans.10///11//===----------------------------------------------------------------------===//1213#include "VPRecipeBuilder.h"14#include "VPlan.h"15#include "VPlanAnalysis.h"16#include "VPlanCFG.h"17#include "VPlanHelpers.h"18#include "VPlanPatternMatch.h"19#include "VPlanTransforms.h"20#include "VPlanUtils.h"21#include "llvm/ADT/PostOrderIterator.h"22#include "llvm/ADT/STLExtras.h"23#include "llvm/ADT/ScopeExit.h"24#include "llvm/Analysis/IVDescriptors.h"25#include "llvm/IR/Intrinsics.h"2627using namespace llvm;28using namespace llvm::VPlanPatternMatch;2930namespace {3132/// Helper to hold state needed for unrolling. It holds the Plan to unroll by33/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate34/// the unrolling transformation, where the original VPValues are retained for35/// part zero.36class UnrollState {37/// Plan to unroll.38VPlan &Plan;39/// Unroll factor to unroll by.40const unsigned UF;41/// Analysis for types.42VPTypeAnalysis TypeInfo;4344/// Unrolling may create recipes that should not be unrolled themselves.45/// Those are tracked in ToSkip.46SmallPtrSet<VPRecipeBase *, 8> ToSkip;4748// Associate with each VPValue of part 0 its unrolled instances of parts 1,49// ..., UF-1.50DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;5152/// Unroll replicate region \p VPR by cloning the region UF - 1 times.53void unrollReplicateRegionByUF(VPRegionBlock *VPR);5455/// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across56/// all parts.57void unrollRecipeByUF(VPRecipeBase &R);5859/// Unroll header phi recipe \p R. How exactly the recipe gets unrolled60/// depends on the concrete header phi. Inserts newly created recipes at \p61/// InsertPtForPhi.62void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,63VPBasicBlock::iterator InsertPtForPhi);6465/// Unroll a widen induction recipe \p IV. This introduces recipes to compute66/// the induction steps for each part.67void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,68VPBasicBlock::iterator InsertPtForPhi);6970VPValue *getConstantVPV(unsigned Part) {71Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();72return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));73}7475public:76UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)77: Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}7879void unrollBlock(VPBlockBase *VPB);8081VPValue *getValueForPart(VPValue *V, unsigned Part) {82if (Part == 0 || V->isLiveIn())83return V;84assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&85"accessed value does not exist");86return VPV2Parts[V][Part - 1];87}8889/// Given a single original recipe \p OrigR (of part zero), and its copy \p90/// CopyR for part \p Part, map every VPValue defined by \p OrigR to its91/// corresponding VPValue defined by \p CopyR.92void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,93unsigned Part) {94for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {95auto Ins = VPV2Parts.insert({VPV, {}});96assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");97Ins.first->second.push_back(CopyR->getVPValue(Idx));98}99}100101/// Given a uniform recipe \p R, add it for all parts.102void addUniformForAllParts(VPSingleDefRecipe *R) {103auto Ins = VPV2Parts.insert({R, {}});104assert(Ins.second && "uniform value already added");105for (unsigned Part = 0; Part != UF; ++Part)106Ins.first->second.push_back(R);107}108109bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }110111/// Update \p R's operand at \p OpIdx with its corresponding VPValue for part112/// \p P.113void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {114auto *Op = R->getOperand(OpIdx);115R->setOperand(OpIdx, getValueForPart(Op, Part));116}117118/// Update \p R's operands with their corresponding VPValues for part \p P.119void remapOperands(VPRecipeBase *R, unsigned Part) {120for (const auto &[OpIdx, Op] : enumerate(R->operands()))121R->setOperand(OpIdx, getValueForPart(Op, Part));122}123};124} // namespace125126void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {127VPBlockBase *InsertPt = VPR->getSingleSuccessor();128for (unsigned Part = 1; Part != UF; ++Part) {129auto *Copy = VPR->clone();130VPBlockUtils::insertBlockBefore(Copy, InsertPt);131132auto PartI = vp_depth_first_shallow(Copy->getEntry());133auto Part0 = vp_depth_first_shallow(VPR->getEntry());134for (const auto &[PartIVPBB, Part0VPBB] :135zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),136VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {137for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {138remapOperands(&PartIR, Part);139if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {140ScalarIVSteps->addOperand(getConstantVPV(Part));141}142143addRecipeForPart(&Part0R, &PartIR, Part);144}145}146}147}148149void UnrollState::unrollWidenInductionByUF(150VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {151VPBasicBlock *PH = cast<VPBasicBlock>(152IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());153Type *IVTy = TypeInfo.inferScalarType(IV);154auto &ID = IV->getInductionDescriptor();155VPIRFlags Flags;156if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))157Flags = ID.getInductionBinOp()->getFastMathFlags();158159VPValue *ScalarStep = IV->getStepValue();160VPBuilder Builder(PH);161VPInstruction *VectorStep = Builder.createNaryOp(162VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, IVTy, Flags,163IV->getDebugLoc());164165ToSkip.insert(VectorStep);166167// Now create recipes to compute the induction steps for part 1 .. UF. Part 0168// remains the header phi. Parts > 0 are computed by adding Step to the169// previous part. The header phi recipe will get 2 new operands: the step170// value for a single part and the last part, used to compute the backedge171// value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 =172// VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3173// %Part.1 = %Part.0 + %VectorStep174// %Part.2 = %Part.1 + %VectorStep175// %Part.3 = %Part.2 + %VectorStep176//177// The newly added recipes are added to ToSkip to avoid interleaving them178// again.179VPValue *Prev = IV;180Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);181unsigned AddOpc =182IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;183for (unsigned Part = 1; Part != UF; ++Part) {184std::string Name =185Part > 1 ? "step.add." + std::to_string(Part) : "step.add";186187VPInstruction *Add = Builder.createNaryOp(AddOpc,188{189Prev,190VectorStep,191},192Flags, IV->getDebugLoc(), Name);193ToSkip.insert(Add);194addRecipeForPart(IV, Add, Part);195Prev = Add;196}197IV->addOperand(VectorStep);198IV->addOperand(Prev);199}200201void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,202VPBasicBlock::iterator InsertPtForPhi) {203// First-order recurrences pass a single vector or scalar through their header204// phis, irrespective of interleaving.205if (isa<VPFirstOrderRecurrencePHIRecipe>(R))206return;207208// Generate step vectors for each unrolled part.209if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {210unrollWidenInductionByUF(IV, InsertPtForPhi);211return;212}213214auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);215if (RdxPhi && RdxPhi->isOrdered())216return;217218auto InsertPt = std::next(R->getIterator());219for (unsigned Part = 1; Part != UF; ++Part) {220VPRecipeBase *Copy = R->clone();221Copy->insertBefore(*R->getParent(), InsertPt);222addRecipeForPart(R, Copy, Part);223if (isa<VPWidenPointerInductionRecipe>(R)) {224Copy->addOperand(R);225Copy->addOperand(getConstantVPV(Part));226} else if (RdxPhi) {227// If the start value is a ReductionStartVector, use the identity value228// (second operand) for unrolled parts. If the scaling factor is > 1,229// create a new ReductionStartVector with the scale factor and both230// operands set to the identity value.231if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {232assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&233"unexpected start VPInstruction");234if (Part != 1)235continue;236VPValue *StartV;237if (match(VPI->getOperand(2), m_SpecificInt(1))) {238StartV = VPI->getOperand(1);239} else {240auto *C = VPI->clone();241C->setOperand(0, C->getOperand(1));242C->insertAfter(VPI);243StartV = C;244}245for (unsigned Part = 1; Part != UF; ++Part)246VPV2Parts[VPI][Part - 1] = StartV;247}248Copy->addOperand(getConstantVPV(Part));249} else {250assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&251"unexpected header phi recipe not needing unrolled part");252}253}254}255256/// Handle non-header-phi recipes.257void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {258if (match(&R, m_BranchOnCond(m_VPValue())) ||259match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))260return;261262if (auto *VPI = dyn_cast<VPInstruction>(&R)) {263if (vputils::onlyFirstPartUsed(VPI)) {264addUniformForAllParts(VPI);265return;266}267}268if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {269if (isa<StoreInst>(RepR->getUnderlyingValue()) &&270RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {271// Stores to an invariant address only need to store the last part.272remapOperands(&R, UF - 1);273return;274}275if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {276if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {277addUniformForAllParts(RepR);278return;279}280}281}282283// Unroll non-uniform recipes.284auto InsertPt = std::next(R.getIterator());285VPBasicBlock &VPBB = *R.getParent();286for (unsigned Part = 1; Part != UF; ++Part) {287VPRecipeBase *Copy = R.clone();288Copy->insertBefore(VPBB, InsertPt);289addRecipeForPart(&R, Copy, Part);290291VPValue *Op;292if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(293m_VPValue(), m_VPValue(Op)))) {294Copy->setOperand(0, getValueForPart(Op, Part - 1));295Copy->setOperand(1, getValueForPart(Op, Part));296continue;297}298if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {299auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));300if (Phi && Phi->isOrdered()) {301auto &Parts = VPV2Parts[Phi];302if (Part == 1) {303Parts.clear();304Parts.push_back(Red);305}306Parts.push_back(Copy->getVPSingleValue());307Phi->setOperand(1, Copy->getVPSingleValue());308}309}310remapOperands(Copy, Part);311312// Add operand indicating the part to generate code for, to recipes still313// requiring it.314if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,315VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||316match(Copy, m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(317m_VPValue())))318Copy->addOperand(getConstantVPV(Part));319320if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))321Copy->setOperand(0, R.getOperand(0));322}323}324325void UnrollState::unrollBlock(VPBlockBase *VPB) {326auto *VPR = dyn_cast<VPRegionBlock>(VPB);327if (VPR) {328if (VPR->isReplicator())329return unrollReplicateRegionByUF(VPR);330331// Traverse blocks in region in RPO to ensure defs are visited before uses332// across blocks.333ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>334RPOT(VPR->getEntry());335for (VPBlockBase *VPB : RPOT)336unrollBlock(VPB);337return;338}339340// VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.341auto *VPBB = cast<VPBasicBlock>(VPB);342auto InsertPtForPhi = VPBB->getFirstNonPhi();343for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {344if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))345continue;346347// Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and348// Compute*Result which combine all parts to compute the final value.349VPValue *Op1;350if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||351match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(352m_VPValue(Op1))) ||353match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(354m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||355match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(356m_VPValue(), m_VPValue(Op1))) ||357match(&R, m_VPInstruction<VPInstruction::ComputeFindIVResult>(358m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {359addUniformForAllParts(cast<VPInstruction>(&R));360for (unsigned Part = 1; Part != UF; ++Part)361R.addOperand(getValueForPart(Op1, Part));362continue;363}364VPValue *Op0;365if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(366m_VPValue(Op0))) ||367match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(368m_VPValue(Op0)))) {369addUniformForAllParts(cast<VPSingleDefRecipe>(&R));370if (Plan.hasScalarVFOnly()) {371auto *I = cast<VPInstruction>(&R);372// Extracting from end with VF = 1 implies retrieving the last or373// penultimate scalar part (UF-1 or UF-2).374unsigned Offset =375I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;376I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));377R.eraseFromParent();378} else {379// Otherwise we extract from the last part.380remapOperands(&R, UF - 1);381}382continue;383}384385auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);386if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {387addUniformForAllParts(SingleDef);388continue;389}390391if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {392unrollHeaderPHIByUF(H, InsertPtForPhi);393continue;394}395396unrollRecipeByUF(R);397}398}399400void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {401assert(UF > 0 && "Unroll factor must be positive");402Plan.setUF(UF);403auto Cleanup = make_scope_exit([&Plan]() {404auto Iter = vp_depth_first_deep(Plan.getEntry());405// Remove recipes that are redundant after unrolling.406for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {407for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {408auto *VPI = dyn_cast<VPInstruction>(&R);409if (VPI &&410VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&411VPI->getNumOperands() == 1) {412VPI->replaceAllUsesWith(VPI->getOperand(0));413VPI->eraseFromParent();414}415}416}417});418if (UF == 1) {419return;420}421422UnrollState Unroller(Plan, UF, Ctx);423424// Iterate over all blocks in the plan starting from Entry, and unroll425// recipes inside them. This includes the vector preheader and middle blocks,426// which may set up or post-process per-part values.427ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(428Plan.getEntry());429for (VPBlockBase *VPB : RPOT)430Unroller.unrollBlock(VPB);431432unsigned Part = 1;433// Remap operands of cloned header phis to update backedge values. The header434// phis cloned during unrolling are just after the header phi for part 0.435// Reset Part to 1 when reaching the first (part 0) recipe of a block.436for (VPRecipeBase &H :437Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {438// The second operand of Fixed Order Recurrence phi's, feeding the spliced439// value across the backedge, needs to remap to the last part of the spliced440// value.441if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {442Unroller.remapOperand(&H, 1, UF - 1);443continue;444}445if (Unroller.contains(H.getVPSingleValue()) ||446isa<VPWidenPointerInductionRecipe>(&H)) {447Part = 1;448continue;449}450Unroller.remapOperands(&H, Part);451Part++;452}453454VPlanTransforms::removeDeadRecipes(Plan);455}456457/// Create a single-scalar clone of \p RepR for lane \p Lane.458static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,459Type *IdxTy, VPReplicateRecipe *RepR,460VPLane Lane) {461// Collect the operands at Lane, creating extracts as needed.462SmallVector<VPValue *> NewOps;463for (VPValue *Op : RepR->operands()) {464if (vputils::isSingleScalar(Op)) {465NewOps.push_back(Op);466continue;467}468if (Lane.getKind() == VPLane::Kind::ScalableLast) {469NewOps.push_back(470Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));471continue;472}473// Look through buildvector to avoid unnecessary extracts.474if (match(Op, m_BuildVector())) {475NewOps.push_back(476cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));477continue;478}479VPValue *Idx =480Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));481VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});482NewOps.push_back(Ext);483}484485auto *New =486new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,487/*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);488New->transferFlags(*RepR);489New->insertBefore(RepR);490return New;491}492493void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {494Type *IdxTy = IntegerType::get(495Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);496497// Visit all VPBBs outside the loop region and directly inside the top-level498// loop region.499auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(500vp_depth_first_shallow(Plan.getEntry()));501auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(502vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));503auto VPBBsToUnroll =504concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);505for (VPBasicBlock *VPBB : VPBBsToUnroll) {506for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {507auto *RepR = dyn_cast<VPReplicateRecipe>(&R);508if (!RepR || RepR->isSingleScalar())509continue;510511VPBuilder Builder(RepR);512if (RepR->getNumUsers() == 0) {513if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&514vputils::isSingleScalar(RepR->getOperand(1))) {515// Stores to invariant addresses need to store the last lane only.516cloneForLane(Plan, Builder, IdxTy, RepR,517VPLane::getLastLaneForVF(VF));518} else {519// Create single-scalar version of RepR for all lanes.520for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)521cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));522}523RepR->eraseFromParent();524continue;525}526/// Create single-scalar version of RepR for all lanes.527SmallVector<VPValue *> LaneDefs;528for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)529LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));530531/// Users that only demand the first lane can use the definition for lane532/// 0.533RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {534return U.onlyFirstLaneUsed(RepR);535});536537// If needed, create a Build(Struct)Vector recipe to insert the scalar538// lane values into a vector.539Type *ResTy = RepR->getUnderlyingInstr()->getType();540VPValue *VecRes = Builder.createNaryOp(541ResTy->isStructTy() ? VPInstruction::BuildStructVector542: VPInstruction::BuildVector,543LaneDefs);544RepR->replaceAllUsesWith(VecRes);545RepR->eraseFromParent();546}547}548}549550551