Path: blob/main/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
35266 views
//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7/// This file implements SLP analysis based on VPlan. The analysis is based on8/// the ideas described in9///10/// Look-ahead SLP: auto-vectorization in the presence of commutative11/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,12/// Luís F. W. Góes13///14//===----------------------------------------------------------------------===//1516#include "VPlan.h"17#include "VPlanValue.h"18#include "llvm/ADT/DenseMap.h"19#include "llvm/ADT/SmallVector.h"20#include "llvm/Analysis/VectorUtils.h"21#include "llvm/IR/Instruction.h"22#include "llvm/IR/Instructions.h"23#include "llvm/IR/Type.h"24#include "llvm/IR/Value.h"25#include "llvm/Support/Casting.h"26#include "llvm/Support/Debug.h"27#include "llvm/Support/ErrorHandling.h"28#include "llvm/Support/raw_ostream.h"29#include <algorithm>30#include <cassert>31#include <optional>32#include <utility>3334using namespace llvm;3536#define DEBUG_TYPE "vplan-slp"3738// Number of levels to look ahead when re-ordering multi node operands.39static unsigned LookaheadMaxDepth = 5;4041VPInstruction *VPlanSlp::markFailed() {42// FIXME: Currently this is used to signal we hit instructions we cannot43// trivially SLP'ize.44CompletelySLP = false;45return nullptr;46}4748void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {49if (all_of(Operands, [](VPValue *V) {50return cast<VPInstruction>(V)->getUnderlyingInstr();51})) {52unsigned BundleSize = 0;53for (VPValue *V : Operands) {54Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();55assert(!T->isVectorTy() && "Only scalar types supported for now");56BundleSize += T->getScalarSizeInBits();57}58WidestBundleBits = std::max(WidestBundleBits, BundleSize);59}6061auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);62assert(Res.second &&63"Already created a combined instruction for the operand bundle");64(void)Res;65}6667bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {68// Currently we only support VPInstructions.69if (!all_of(Operands, [](VPValue *Op) {70return Op && isa<VPInstruction>(Op) &&71cast<VPInstruction>(Op)->getUnderlyingInstr();72})) {73LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");74return false;75}7677// Check if opcodes and type width agree for all instructions in the bundle.78// FIXME: Differing widths/opcodes can be handled by inserting additional79// instructions.80// FIXME: Deal with non-primitive types.81const Instruction *OriginalInstr =82cast<VPInstruction>(Operands[0])->getUnderlyingInstr();83unsigned Opcode = OriginalInstr->getOpcode();84unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();85if (!all_of(Operands, [Opcode, Width](VPValue *Op) {86const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();87return I->getOpcode() == Opcode &&88I->getType()->getPrimitiveSizeInBits() == Width;89})) {90LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");91return false;92}9394// For now, all operands must be defined in the same BB.95if (any_of(Operands, [this](VPValue *Op) {96return cast<VPInstruction>(Op)->getParent() != &this->BB;97})) {98LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");99return false;100}101102if (any_of(Operands,103[](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {104LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");105return false;106}107108// For loads, check that there are no instructions writing to memory in109// between them.110// TODO: we only have to forbid instructions writing to memory that could111// interfere with any of the loads in the bundle112if (Opcode == Instruction::Load) {113unsigned LoadsSeen = 0;114VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();115for (auto &I : *Parent) {116auto *VPI = dyn_cast<VPInstruction>(&I);117if (!VPI)118break;119if (VPI->getOpcode() == Instruction::Load &&120llvm::is_contained(Operands, VPI))121LoadsSeen++;122123if (LoadsSeen == Operands.size())124break;125if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {126LLVM_DEBUG(127dbgs() << "VPSLP: instruction modifying memory between loads\n");128return false;129}130}131132if (!all_of(Operands, [](VPValue *Op) {133return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())134->isSimple();135})) {136LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");137return false;138}139}140141if (Opcode == Instruction::Store)142if (!all_of(Operands, [](VPValue *Op) {143return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())144->isSimple();145})) {146LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");147return false;148}149150return true;151}152153static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,154unsigned OperandIndex) {155SmallVector<VPValue *, 4> Operands;156for (VPValue *V : Values) {157// Currently we only support VPInstructions.158auto *U = cast<VPInstruction>(V);159Operands.push_back(U->getOperand(OperandIndex));160}161return Operands;162}163164static bool areCommutative(ArrayRef<VPValue *> Values) {165return Instruction::isCommutative(166cast<VPInstruction>(Values[0])->getOpcode());167}168169static SmallVector<SmallVector<VPValue *, 4>, 4>170getOperands(ArrayRef<VPValue *> Values) {171SmallVector<SmallVector<VPValue *, 4>, 4> Result;172auto *VPI = cast<VPInstruction>(Values[0]);173174switch (VPI->getOpcode()) {175case Instruction::Load:176llvm_unreachable("Loads terminate a tree, no need to get operands");177case Instruction::Store:178Result.push_back(getOperands(Values, 0));179break;180default:181for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)182Result.push_back(getOperands(Values, I));183break;184}185186return Result;187}188189/// Returns the opcode of Values or ~0 if they do not all agree.190static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {191unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();192if (any_of(Values, [Opcode](VPValue *V) {193return cast<VPInstruction>(V)->getOpcode() != Opcode;194}))195return std::nullopt;196return {Opcode};197}198199/// Returns true if A and B access sequential memory if they are loads or200/// stores or if they have identical opcodes otherwise.201static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,202VPInterleavedAccessInfo &IAI) {203if (A->getOpcode() != B->getOpcode())204return false;205206if (A->getOpcode() != Instruction::Load &&207A->getOpcode() != Instruction::Store)208return true;209auto *GA = IAI.getInterleaveGroup(A);210auto *GB = IAI.getInterleaveGroup(B);211212return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);213}214215/// Implements getLAScore from Listing 7 in the paper.216/// Traverses and compares operands of V1 and V2 to MaxLevel.217static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,218VPInterleavedAccessInfo &IAI) {219auto *I1 = dyn_cast<VPInstruction>(V1);220auto *I2 = dyn_cast<VPInstruction>(V2);221// Currently we only support VPInstructions.222if (!I1 || !I2)223return 0;224225if (MaxLevel == 0)226return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);227228unsigned Score = 0;229for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)230for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)231Score +=232getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);233return Score;234}235236std::pair<VPlanSlp::OpMode, VPValue *>237VPlanSlp::getBest(OpMode Mode, VPValue *Last,238SmallPtrSetImpl<VPValue *> &Candidates,239VPInterleavedAccessInfo &IAI) {240assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&241"Currently we only handle load and commutative opcodes");242LLVM_DEBUG(dbgs() << " getBest\n");243244SmallVector<VPValue *, 4> BestCandidates;245LLVM_DEBUG(dbgs() << " Candidates for "246<< *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");247for (auto *Candidate : Candidates) {248auto *LastI = cast<VPInstruction>(Last);249auto *CandidateI = cast<VPInstruction>(Candidate);250if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {251LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()252<< " ");253BestCandidates.push_back(Candidate);254}255}256LLVM_DEBUG(dbgs() << "\n");257258if (BestCandidates.empty())259return {OpMode::Failed, nullptr};260261if (BestCandidates.size() == 1)262return {Mode, BestCandidates[0]};263264VPValue *Best = nullptr;265unsigned BestScore = 0;266for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {267unsigned PrevScore = ~0u;268bool AllSame = true;269270// FIXME: Avoid visiting the same operands multiple times.271for (auto *Candidate : BestCandidates) {272unsigned Score = getLAScore(Last, Candidate, Depth, IAI);273if (PrevScore == ~0u)274PrevScore = Score;275if (PrevScore != Score)276AllSame = false;277PrevScore = Score;278279if (Score > BestScore) {280BestScore = Score;281Best = Candidate;282}283}284if (!AllSame)285break;286}287LLVM_DEBUG(dbgs() << "Found best "288<< *cast<VPInstruction>(Best)->getUnderlyingInstr()289<< "\n");290Candidates.erase(Best);291292return {Mode, Best};293}294295SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {296SmallVector<MultiNodeOpTy, 4> FinalOrder;297SmallVector<OpMode, 4> Mode;298FinalOrder.reserve(MultiNodeOps.size());299Mode.reserve(MultiNodeOps.size());300301LLVM_DEBUG(dbgs() << "Reordering multinode\n");302303for (auto &Operands : MultiNodeOps) {304FinalOrder.push_back({Operands.first, {Operands.second[0]}});305if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==306Instruction::Load)307Mode.push_back(OpMode::Load);308else309Mode.push_back(OpMode::Opcode);310}311312for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {313LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");314SmallPtrSet<VPValue *, 4> Candidates;315LLVM_DEBUG(dbgs() << " Candidates ");316for (auto Ops : MultiNodeOps) {317LLVM_DEBUG(318dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()319<< " ");320Candidates.insert(Ops.second[Lane]);321}322LLVM_DEBUG(dbgs() << "\n");323324for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {325LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");326if (Mode[Op] == OpMode::Failed)327continue;328329VPValue *Last = FinalOrder[Op].second[Lane - 1];330std::pair<OpMode, VPValue *> Res =331getBest(Mode[Op], Last, Candidates, IAI);332if (Res.second)333FinalOrder[Op].second.push_back(Res.second);334else335// TODO: handle this case336FinalOrder[Op].second.push_back(markFailed());337}338}339340return FinalOrder;341}342343#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)344void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {345dbgs() << " Ops: ";346for (auto *Op : Values) {347if (auto *VPInstr = cast_or_null<VPInstruction>(Op))348if (auto *Instr = VPInstr->getUnderlyingInstr()) {349dbgs() << *Instr << " | ";350continue;351}352dbgs() << " nullptr | ";353}354dbgs() << "\n";355}356#endif357358VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {359assert(!Values.empty() && "Need some operands!");360361// If we already visited this instruction bundle, re-use the existing node362auto I = BundleToCombined.find(to_vector<4>(Values));363if (I != BundleToCombined.end()) {364#ifndef NDEBUG365// Check that the resulting graph is a tree. If we re-use a node, this means366// its values have multiple users. We only allow this, if all users of each367// value are the same instruction.368for (auto *V : Values) {369auto UI = V->user_begin();370auto *FirstUser = *UI++;371while (UI != V->user_end()) {372assert(*UI == FirstUser && "Currently we only support SLP trees.");373UI++;374}375}376#endif377return I->second;378}379380// Dump inputs381LLVM_DEBUG({382dbgs() << "buildGraph: ";383dumpBundle(Values);384});385386if (!areVectorizable(Values))387return markFailed();388389assert(getOpcode(Values) && "Opcodes for all values must match");390unsigned ValuesOpcode = *getOpcode(Values);391392SmallVector<VPValue *, 4> CombinedOperands;393if (areCommutative(Values)) {394bool MultiNodeRoot = !MultiNodeActive;395MultiNodeActive = true;396for (auto &Operands : getOperands(Values)) {397LLVM_DEBUG({398dbgs() << " Visiting Commutative";399dumpBundle(Operands);400});401402auto OperandsOpcode = getOpcode(Operands);403if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {404LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");405CombinedOperands.push_back(buildGraph(Operands));406} else {407LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");408// Create dummy VPInstruction, which will we replace later by the409// re-ordered operand.410VPInstruction *Op = new VPInstruction(0, {});411CombinedOperands.push_back(Op);412MultiNodeOps.emplace_back(Op, Operands);413}414}415416if (MultiNodeRoot) {417LLVM_DEBUG(dbgs() << "Reorder \n");418MultiNodeActive = false;419420auto FinalOrder = reorderMultiNodeOps();421422MultiNodeOps.clear();423for (auto &Ops : FinalOrder) {424VPInstruction *NewOp = buildGraph(Ops.second);425Ops.first->replaceAllUsesWith(NewOp);426for (unsigned i = 0; i < CombinedOperands.size(); i++)427if (CombinedOperands[i] == Ops.first)428CombinedOperands[i] = NewOp;429delete Ops.first;430Ops.first = NewOp;431}432LLVM_DEBUG(dbgs() << "Found final order\n");433}434} else {435LLVM_DEBUG(dbgs() << " NonCommuntative\n");436if (ValuesOpcode == Instruction::Load)437for (VPValue *V : Values)438CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));439else440for (auto &Operands : getOperands(Values))441CombinedOperands.push_back(buildGraph(Operands));442}443444unsigned Opcode;445switch (ValuesOpcode) {446case Instruction::Load:447Opcode = VPInstruction::SLPLoad;448break;449case Instruction::Store:450Opcode = VPInstruction::SLPStore;451break;452default:453Opcode = ValuesOpcode;454break;455}456457if (!CompletelySLP)458return markFailed();459460assert(CombinedOperands.size() > 0 && "Need more some operands");461auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();462auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());463464LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "465<< *cast<VPInstruction>(Values[0]) << "\n");466addCombined(Values, VPI);467return VPI;468}469470471