Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
35269 views
//===-- AArch64AdvSIMDScalar.cpp - Use AdvSIMD scalar for i64 insns ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// TODO: Graph based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// holistic approach.
//
// This optimization is very similar in spirit to the register allocator's
// spill placement, only here we're determining where to place cross-class
// register copies rather than spills. As such, a similar approach is
// called for.
//
// We want to build up a set of graphs of all instructions which are candidates
// for transformation along with instructions which generate their inputs and
// consume their outputs. For each edge in the graph, we assign a weight
// based on whether there is a copy required there (weight zero if not) and
// the block frequency of the block containing the defining or using
// instruction, whichever is less. Our optimization is then a graph problem
// to minimize the total weight of all the graphs, then transform instructions
// and add or remove copy instructions as called for to implement the
// solution.
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-scalar"

// Allow forcing all i64 operations with equivalent SIMD instructions to use
// them. For stress-testing the transformation function.
static cl::opt<bool>
TransformAll("aarch64-simd-scalar-force-all",
             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
             cl::init(false), cl::Hidden);

STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");

#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"

namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
  // Per-function state, initialized in runOnMachineFunction before any
  // basic block is processed.
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;

private:
  // isProfitableToTransform - Predicate function to determine whether an
  // instruction should be transformed to its equivalent AdvSIMD scalar
  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
  bool isProfitableToTransform(const MachineInstr &MI) const;

  // transformInstruction - Perform the transformation of an instruction
  // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
  // to be the correct register class, minimizing cross-class copies.
  void transformInstruction(MachineInstr &MI);

  // processMachineBasicBlock - Main optimization loop.
  bool processMachineBasicBlock(MachineBasicBlock *MBB);

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
    initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
                AARCH64_ADVSIMD_NAME, false, false)

// isGPR64 - Return true if (Reg, SubReg) names a full (no subregister)
// GPR64 register, virtual or physical.
static bool isGPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (SubReg)
    return false;
  if (Register::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
  return AArch64::GPR64RegClass.contains(Reg);
}

// isFPR64 - Return true if (Reg, SubReg) names a 64-bit FP register: either
// a full FPR64 reference, or the dsub subregister of an FPR128.
static bool isFPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (Register::isVirtualRegister(Reg))
    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
            SubReg == 0) ||
           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
            SubReg == AArch64::dsub);
  // Physical register references just check the register class directly.
  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
}

// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return nullptr if the instruction is not a copy.
// On success, SubReg is set to the subregister index (if any) needed to
// reference the 64-bit FP portion of the returned operand's register.
static MachineOperand *getSrcFromCopy(MachineInstr *MI,
                                      const MachineRegisterInfo *MRI,
                                      unsigned &SubReg) {
  SubReg = 0;
  // The "FMOV Xd, Dn" instruction is the typical form.
  if (MI->getOpcode() == AArch64::FMOVDXr ||
      MI->getOpcode() == AArch64::FMOVXDr)
    return &MI->getOperand(1);
  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
  // these at this stage, but it's easy to check for.
  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
    SubReg = AArch64::dsub;
    return &MI->getOperand(1);
  }
  // Or just a plain COPY instruction. This can be directly to/from FPR64,
  // or it can be a dsub subreg reference to an FPR128.
  if (MI->getOpcode() == AArch64::COPY) {
    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
      return &MI->getOperand(1);
    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI)) {
      SubReg = MI->getOperand(1).getSubReg();
      return &MI->getOperand(1);
    }
  }

  // Otherwise, this is some other kind of instruction.
  return nullptr;
}

// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
// that we're considering transforming to, return that AdvSIMD opcode. For all
// others, return the original opcode.
static unsigned getTransformOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    break;
  // FIXME: Lots more possibilities.
  case AArch64::ADDXrr:
    return AArch64::ADDv1i64;
  case AArch64::SUBXrr:
    return AArch64::SUBv1i64;
  case AArch64::ANDXrr:
    return AArch64::ANDv8i8;
  case AArch64::EORXrr:
    return AArch64::EORv8i8;
  case AArch64::ORRXrr:
    return AArch64::ORRv8i8;
  }
  // No AdvSIMD equivalent, so just return the original opcode.
  return Opc;
}

// isTransformable - Return true if MI has an AdvSIMD scalar equivalent,
// i.e. getTransformOpcode maps its opcode to a different one.
static bool isTransformable(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return Opc != getTransformOpcode(Opc);
}

// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
bool AArch64AdvSIMDScalar::isProfitableToTransform(
    const MachineInstr &MI) const {
  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
  // early exit since that's the common case.
  if (!isTransformable(MI))
    return false;

  // Count the number of copies we'll need to add and approximate the number
  // of copies that a transform will enable us to remove.
  // Worst case: a copy for each of the two sources plus one for the result.
  unsigned NumNewCopies = 3;
  unsigned NumRemovableCopies = 0;

  Register OrigSrc0 = MI.getOperand(1).getReg();
  Register OrigSrc1 = MI.getOperand(2).getReg();
  // Out-params for getSrcFromCopy; the subreg indices themselves aren't
  // needed by the profitability heuristic.
  unsigned SubReg0;
  unsigned SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, we don't need to insert a new copy.
    if (MOSrc0)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
      ++NumRemovableCopies;
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If the source was from a copy, we don't need to insert a new copy.
    if (MOSrc1)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
      ++NumRemovableCopies;
  }

  // If any of the uses of the original instructions is a cross class copy,
  // that's a copy that will be removable if we transform. Likewise, if
  // any of the uses is a transformable instruction, it's likely the transforms
  // will chain, enabling us to save a copy there, too. This is an aggressive
  // heuristic that approximates the graph based cost analysis described above.
  Register Dst = MI.getOperand(0).getReg();
  bool AllUsesAreCopies = true;
  for (MachineRegisterInfo::use_instr_nodbg_iterator
           Use = MRI->use_instr_nodbg_begin(Dst),
           E = MRI->use_instr_nodbg_end();
       Use != E; ++Use) {
    unsigned SubReg;
    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
      ++NumRemovableCopies;
    // If the use is an INSERT_SUBREG, that's still something that can
    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
    // preferable to have it use the FPR64 in most cases, as if the source
    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
    // Ditto for a lane insert.
    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
             Use->getOpcode() == AArch64::INSvi64gpr)
      ;
    else
      AllUsesAreCopies = false;
  }
  // If all of the uses of the original destination register are copies to
  // FPR64, then we won't end up having a new copy back to GPR64 either.
  if (AllUsesAreCopies)
    --NumNewCopies;

  // If a transform will not increase the number of cross-class copies required,
  // return true.
  if (NumNewCopies <= NumRemovableCopies)
    return true;

  // Finally, even if we otherwise wouldn't transform, check if we're forcing
  // transformation of everything.
  return TransformAll;
}

// insertCopy - Insert a COPY from Src to Dst immediately before MI and
// bump the inserted-copy statistic. Returns the new copy instruction.
static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
                                unsigned Dst, unsigned Src, bool IsKill) {
  MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                                    TII->get(AArch64::COPY), Dst)
                                .addReg(Src, getKillRegState(IsKill));
  LLVM_DEBUG(dbgs() << "    adding copy: " << *MIB);
  ++NumCopiesInserted;
  return MIB;
}

// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
  LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);

  MachineBasicBlock *MBB = MI.getParent();
  unsigned OldOpc = MI.getOpcode();
  unsigned NewOpc = getTransformOpcode(OldOpc);
  assert(OldOpc != NewOpc && "transform an instruction to itself?!");

  // Check if we need a copy for the source registers.
  Register OrigSrc0 = MI.getOperand(1).getReg();
  Register OrigSrc1 = MI.getOperand(2).getReg();
  // Src0/Src1 stay zero (sentinel for "copy needed") unless we can reach
  // through an existing cross-class copy to an FPR64 source.
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  bool KillSrc0 = false, KillSrc1 = false;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0) {
      Src0 = MOSrc0->getReg();
      KillSrc0 = MOSrc0->isKill();
      // Src0 is going to be reused, thus, it cannot be killed anymore.
      MOSrc0->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc0)) {
        assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1) {
      Src1 = MOSrc1->getReg();
      KillSrc1 = MOSrc1->isKill();
      // Src1 is going to be reused, thus, it cannot be killed anymore.
      MOSrc1->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc1)) {
        assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  // If we weren't able to reference the original source directly, create a
  // copy.
  if (!Src0) {
    SubReg0 = 0;
    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
    KillSrc0 = true;
  }
  if (!Src1) {
    SubReg1 = 0;
    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
    KillSrc1 = true;
  }

  // Create a vreg for the destination.
  // FIXME: No need to do this if the ultimate user expects an FPR64.
  // Check for that and avoid the copy if possible.
  Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

  // For now, all of the new instructions have the same simple three-register
  // form, so no need to special case based on what instruction we're
  // building.
  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
      .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
      .addReg(Src1, getKillRegState(KillSrc1), SubReg1);

  // Now copy the result back out to a GPR.
  // FIXME: Try to avoid this if all uses could actually just use the FPR64
  // directly.
  insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);

  // Erase the old instruction.
  MI.eraseFromParent();

  ++NumScalarInsnsUsed;
}

// processMachineBasicBlock - Main optimization loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
  bool Changed = false;
  // early_inc_range: transformInstruction erases MI, so advance first.
  for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
    if (isProfitableToTransform(MI)) {
      transformInstruction(MI);
      Changed = true;
    }
  }
  return Changed;
}

// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");

  if (skipFunction(mf.getFunction()))
    return false;

  MRI = &mf.getRegInfo();
  TII = mf.getSubtarget().getInstrInfo();

  // Just check things on a one-block-at-a-time basis.
  for (MachineBasicBlock &MBB : mf)
    if (processMachineBasicBlock(&MBB))
      Changed = true;
  return Changed;
}

// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
// to add the pass to the PassManager.
FunctionPass *llvm::createAArch64AdvSIMDScalar() {
  return new AArch64AdvSIMDScalar();
}