Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
35294 views
//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file contains a pass that performs load / store related peephole9// optimizations. This pass should be run after register allocation.10//11// The pass runs after the PrologEpilogInserter where we emit the CFI12// instructions. In order to preserve the correctness of the unwind informaiton,13// the pass should not change the order of any two instructions, one of which14// has the FrameSetup/FrameDestroy flag or, alternatively, apply an add-hoc fix15// to unwind information.16//17//===----------------------------------------------------------------------===//1819#include "AArch64InstrInfo.h"20#include "AArch64MachineFunctionInfo.h"21#include "AArch64Subtarget.h"22#include "MCTargetDesc/AArch64AddressingModes.h"23#include "llvm/ADT/SmallVector.h"24#include "llvm/ADT/Statistic.h"25#include "llvm/ADT/StringRef.h"26#include "llvm/ADT/iterator_range.h"27#include "llvm/Analysis/AliasAnalysis.h"28#include "llvm/CodeGen/MachineBasicBlock.h"29#include "llvm/CodeGen/MachineFunction.h"30#include "llvm/CodeGen/MachineFunctionPass.h"31#include "llvm/CodeGen/MachineInstr.h"32#include "llvm/CodeGen/MachineInstrBuilder.h"33#include "llvm/CodeGen/MachineOperand.h"34#include "llvm/CodeGen/MachineRegisterInfo.h"35#include "llvm/CodeGen/TargetRegisterInfo.h"36#include "llvm/IR/DebugLoc.h"37#include "llvm/MC/MCAsmInfo.h"38#include "llvm/MC/MCDwarf.h"39#include "llvm/MC/MCRegisterInfo.h"40#include "llvm/Pass.h"41#include "llvm/Support/CommandLine.h"42#include "llvm/Support/Debug.h"43#include "llvm/Support/DebugCounter.h"44#include "llvm/Support/ErrorHandling.h"45#include "llvm/Support/raw_ostream.h"46#include 
<cassert>47#include <cstdint>48#include <functional>49#include <iterator>50#include <limits>51#include <optional>5253using namespace llvm;5455#define DEBUG_TYPE "aarch64-ldst-opt"5657STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");58STATISTIC(NumPostFolded, "Number of post-index updates folded");59STATISTIC(NumPreFolded, "Number of pre-index updates folded");60STATISTIC(NumUnscaledPairCreated,61"Number of load/store from unscaled generated");62STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");63STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");64STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "65"not passed the alignment check");6667DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",68"Controls which pairs are considered for renaming");6970// The LdStLimit limits how far we search for load/store pairs.71static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",72cl::init(20), cl::Hidden);7374// The UpdateLimit limits how far we search for update instructions when we form75// pre-/post-index instructions.76static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),77cl::Hidden);7879// Enable register renaming to find additional store pairing opportunities.80static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",81cl::init(true), cl::Hidden);8283#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"8485namespace {8687using LdStPairFlags = struct LdStPairFlags {88// If a matching instruction is found, MergeForward is set to true if the89// merge is to remove the first instruction and replace the second with90// a pair-wise insn, and false if the reverse is true.91bool MergeForward = false;9293// SExtIdx gives the index of the result of the load pair that must be94// extended. 
The value of SExtIdx assumes that the paired load produces the95// value in this order: (I, returned iterator), i.e., -1 means no value has96// to be extended, 0 means I, and 1 means the returned iterator.97int SExtIdx = -1;9899// If not none, RenameReg can be used to rename the result register of the100// first store in a pair. Currently this only works when merging stores101// forward.102std::optional<MCPhysReg> RenameReg;103104LdStPairFlags() = default;105106void setMergeForward(bool V = true) { MergeForward = V; }107bool getMergeForward() const { return MergeForward; }108109void setSExtIdx(int V) { SExtIdx = V; }110int getSExtIdx() const { return SExtIdx; }111112void setRenameReg(MCPhysReg R) { RenameReg = R; }113void clearRenameReg() { RenameReg = std::nullopt; }114std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }115};116117struct AArch64LoadStoreOpt : public MachineFunctionPass {118static char ID;119120AArch64LoadStoreOpt() : MachineFunctionPass(ID) {121initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());122}123124AliasAnalysis *AA;125const AArch64InstrInfo *TII;126const TargetRegisterInfo *TRI;127const AArch64Subtarget *Subtarget;128129// Track which register units have been modified and used.130LiveRegUnits ModifiedRegUnits, UsedRegUnits;131LiveRegUnits DefinedInBB;132133void getAnalysisUsage(AnalysisUsage &AU) const override {134AU.addRequired<AAResultsWrapperPass>();135MachineFunctionPass::getAnalysisUsage(AU);136}137138// Scan the instructions looking for a load/store that can be combined139// with the current instruction into a load/store pair.140// Return the matching instruction if one is found, else MBB->end().141MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,142LdStPairFlags &Flags,143unsigned Limit,144bool FindNarrowMerge);145146// Scan the instructions looking for a store that writes to the address from147// which the current load instruction reads. 
Return true if one is found.148bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,149MachineBasicBlock::iterator &StoreI);150151// Merge the two instructions indicated into a wider narrow store instruction.152MachineBasicBlock::iterator153mergeNarrowZeroStores(MachineBasicBlock::iterator I,154MachineBasicBlock::iterator MergeMI,155const LdStPairFlags &Flags);156157// Merge the two instructions indicated into a single pair-wise instruction.158MachineBasicBlock::iterator159mergePairedInsns(MachineBasicBlock::iterator I,160MachineBasicBlock::iterator Paired,161const LdStPairFlags &Flags);162163// Promote the load that reads directly from the address stored to.164MachineBasicBlock::iterator165promoteLoadFromStore(MachineBasicBlock::iterator LoadI,166MachineBasicBlock::iterator StoreI);167168// Scan the instruction list to find a base register update that can169// be combined with the current instruction (a load or store) using170// pre or post indexed addressing with writeback. Scan forwards.171MachineBasicBlock::iterator172findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,173int UnscaledOffset, unsigned Limit);174175// Scan the instruction list to find a base register update that can176// be combined with the current instruction (a load or store) using177// pre or post indexed addressing with writeback. 
Scan backwards.178MachineBasicBlock::iterator179findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);180181// Find an instruction that updates the base register of the ld/st182// instruction.183bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,184unsigned BaseReg, int Offset);185186// Merge a pre- or post-index base register update into a ld/st instruction.187MachineBasicBlock::iterator188mergeUpdateInsn(MachineBasicBlock::iterator I,189MachineBasicBlock::iterator Update, bool IsPreIdx);190191// Find and merge zero store instructions.192bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);193194// Find and pair ldr/str instructions.195bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);196197// Find and promote load instructions which read directly from store.198bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);199200// Find and merge a base register updates before or after a ld/st instruction.201bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);202203bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);204205bool runOnMachineFunction(MachineFunction &Fn) override;206207MachineFunctionProperties getRequiredProperties() const override {208return MachineFunctionProperties().set(209MachineFunctionProperties::Property::NoVRegs);210}211212StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }213};214215char AArch64LoadStoreOpt::ID = 0;216217} // end anonymous namespace218219INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",220AARCH64_LOAD_STORE_OPT_NAME, false, false)221222static bool isNarrowStore(unsigned Opc) {223switch (Opc) {224default:225return false;226case AArch64::STRBBui:227case AArch64::STURBBi:228case AArch64::STRHHui:229case AArch64::STURHHi:230return true;231}232}233234// These instruction set memory tag and either keep memory contents unchanged or235// set it to zero, ignoring the address part of the source register.236static 
bool isTagStore(const MachineInstr &MI) {237switch (MI.getOpcode()) {238default:239return false;240case AArch64::STGi:241case AArch64::STZGi:242case AArch64::ST2Gi:243case AArch64::STZ2Gi:244return true;245}246}247248static unsigned getMatchingNonSExtOpcode(unsigned Opc,249bool *IsValidLdStrOpc = nullptr) {250if (IsValidLdStrOpc)251*IsValidLdStrOpc = true;252switch (Opc) {253default:254if (IsValidLdStrOpc)255*IsValidLdStrOpc = false;256return std::numeric_limits<unsigned>::max();257case AArch64::STRDui:258case AArch64::STURDi:259case AArch64::STRDpre:260case AArch64::STRQui:261case AArch64::STURQi:262case AArch64::STRQpre:263case AArch64::STRBBui:264case AArch64::STURBBi:265case AArch64::STRHHui:266case AArch64::STURHHi:267case AArch64::STRWui:268case AArch64::STRWpre:269case AArch64::STURWi:270case AArch64::STRXui:271case AArch64::STRXpre:272case AArch64::STURXi:273case AArch64::LDRDui:274case AArch64::LDURDi:275case AArch64::LDRDpre:276case AArch64::LDRQui:277case AArch64::LDURQi:278case AArch64::LDRQpre:279case AArch64::LDRWui:280case AArch64::LDURWi:281case AArch64::LDRWpre:282case AArch64::LDRXui:283case AArch64::LDURXi:284case AArch64::LDRXpre:285case AArch64::STRSui:286case AArch64::STURSi:287case AArch64::STRSpre:288case AArch64::LDRSui:289case AArch64::LDURSi:290case AArch64::LDRSpre:291return Opc;292case AArch64::LDRSWui:293return AArch64::LDRWui;294case AArch64::LDURSWi:295return AArch64::LDURWi;296case AArch64::LDRSWpre:297return AArch64::LDRWpre;298}299}300301static unsigned getMatchingWideOpcode(unsigned Opc) {302switch (Opc) {303default:304llvm_unreachable("Opcode has no wide equivalent!");305case AArch64::STRBBui:306return AArch64::STRHHui;307case AArch64::STRHHui:308return AArch64::STRWui;309case AArch64::STURBBi:310return AArch64::STURHHi;311case AArch64::STURHHi:312return AArch64::STURWi;313case AArch64::STURWi:314return AArch64::STURXi;315case AArch64::STRWui:316return AArch64::STRXui;317}318}319320static unsigned getMatchingPairOpcode(unsigned 
Opc) {321switch (Opc) {322default:323llvm_unreachable("Opcode has no pairwise equivalent!");324case AArch64::STRSui:325case AArch64::STURSi:326return AArch64::STPSi;327case AArch64::STRSpre:328return AArch64::STPSpre;329case AArch64::STRDui:330case AArch64::STURDi:331return AArch64::STPDi;332case AArch64::STRDpre:333return AArch64::STPDpre;334case AArch64::STRQui:335case AArch64::STURQi:336return AArch64::STPQi;337case AArch64::STRQpre:338return AArch64::STPQpre;339case AArch64::STRWui:340case AArch64::STURWi:341return AArch64::STPWi;342case AArch64::STRWpre:343return AArch64::STPWpre;344case AArch64::STRXui:345case AArch64::STURXi:346return AArch64::STPXi;347case AArch64::STRXpre:348return AArch64::STPXpre;349case AArch64::LDRSui:350case AArch64::LDURSi:351return AArch64::LDPSi;352case AArch64::LDRSpre:353return AArch64::LDPSpre;354case AArch64::LDRDui:355case AArch64::LDURDi:356return AArch64::LDPDi;357case AArch64::LDRDpre:358return AArch64::LDPDpre;359case AArch64::LDRQui:360case AArch64::LDURQi:361return AArch64::LDPQi;362case AArch64::LDRQpre:363return AArch64::LDPQpre;364case AArch64::LDRWui:365case AArch64::LDURWi:366return AArch64::LDPWi;367case AArch64::LDRWpre:368return AArch64::LDPWpre;369case AArch64::LDRXui:370case AArch64::LDURXi:371return AArch64::LDPXi;372case AArch64::LDRXpre:373return AArch64::LDPXpre;374case AArch64::LDRSWui:375case AArch64::LDURSWi:376return AArch64::LDPSWi;377case AArch64::LDRSWpre:378return AArch64::LDPSWpre;379}380}381382static unsigned isMatchingStore(MachineInstr &LoadInst,383MachineInstr &StoreInst) {384unsigned LdOpc = LoadInst.getOpcode();385unsigned StOpc = StoreInst.getOpcode();386switch (LdOpc) {387default:388llvm_unreachable("Unsupported load instruction!");389case AArch64::LDRBBui:390return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||391StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;392case AArch64::LDURBBi:393return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||394StOpc == 
AArch64::STURWi || StOpc == AArch64::STURXi;395case AArch64::LDRHHui:396return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||397StOpc == AArch64::STRXui;398case AArch64::LDURHHi:399return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||400StOpc == AArch64::STURXi;401case AArch64::LDRWui:402return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;403case AArch64::LDURWi:404return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;405case AArch64::LDRXui:406return StOpc == AArch64::STRXui;407case AArch64::LDURXi:408return StOpc == AArch64::STURXi;409}410}411412static unsigned getPreIndexedOpcode(unsigned Opc) {413// FIXME: We don't currently support creating pre-indexed loads/stores when414// the load or store is the unscaled version. If we decide to perform such an415// optimization in the future the cases for the unscaled loads/stores will416// need to be added here.417switch (Opc) {418default:419llvm_unreachable("Opcode has no pre-indexed equivalent!");420case AArch64::STRSui:421return AArch64::STRSpre;422case AArch64::STRDui:423return AArch64::STRDpre;424case AArch64::STRQui:425return AArch64::STRQpre;426case AArch64::STRBBui:427return AArch64::STRBBpre;428case AArch64::STRHHui:429return AArch64::STRHHpre;430case AArch64::STRWui:431return AArch64::STRWpre;432case AArch64::STRXui:433return AArch64::STRXpre;434case AArch64::LDRSui:435return AArch64::LDRSpre;436case AArch64::LDRDui:437return AArch64::LDRDpre;438case AArch64::LDRQui:439return AArch64::LDRQpre;440case AArch64::LDRBBui:441return AArch64::LDRBBpre;442case AArch64::LDRHHui:443return AArch64::LDRHHpre;444case AArch64::LDRWui:445return AArch64::LDRWpre;446case AArch64::LDRXui:447return AArch64::LDRXpre;448case AArch64::LDRSWui:449return AArch64::LDRSWpre;450case AArch64::LDPSi:451return AArch64::LDPSpre;452case AArch64::LDPSWi:453return AArch64::LDPSWpre;454case AArch64::LDPDi:455return AArch64::LDPDpre;456case AArch64::LDPQi:457return AArch64::LDPQpre;458case 
AArch64::LDPWi:459return AArch64::LDPWpre;460case AArch64::LDPXi:461return AArch64::LDPXpre;462case AArch64::STPSi:463return AArch64::STPSpre;464case AArch64::STPDi:465return AArch64::STPDpre;466case AArch64::STPQi:467return AArch64::STPQpre;468case AArch64::STPWi:469return AArch64::STPWpre;470case AArch64::STPXi:471return AArch64::STPXpre;472case AArch64::STGi:473return AArch64::STGPreIndex;474case AArch64::STZGi:475return AArch64::STZGPreIndex;476case AArch64::ST2Gi:477return AArch64::ST2GPreIndex;478case AArch64::STZ2Gi:479return AArch64::STZ2GPreIndex;480case AArch64::STGPi:481return AArch64::STGPpre;482}483}484485static unsigned getPostIndexedOpcode(unsigned Opc) {486switch (Opc) {487default:488llvm_unreachable("Opcode has no post-indexed wise equivalent!");489case AArch64::STRSui:490case AArch64::STURSi:491return AArch64::STRSpost;492case AArch64::STRDui:493case AArch64::STURDi:494return AArch64::STRDpost;495case AArch64::STRQui:496case AArch64::STURQi:497return AArch64::STRQpost;498case AArch64::STRBBui:499return AArch64::STRBBpost;500case AArch64::STRHHui:501return AArch64::STRHHpost;502case AArch64::STRWui:503case AArch64::STURWi:504return AArch64::STRWpost;505case AArch64::STRXui:506case AArch64::STURXi:507return AArch64::STRXpost;508case AArch64::LDRSui:509case AArch64::LDURSi:510return AArch64::LDRSpost;511case AArch64::LDRDui:512case AArch64::LDURDi:513return AArch64::LDRDpost;514case AArch64::LDRQui:515case AArch64::LDURQi:516return AArch64::LDRQpost;517case AArch64::LDRBBui:518return AArch64::LDRBBpost;519case AArch64::LDRHHui:520return AArch64::LDRHHpost;521case AArch64::LDRWui:522case AArch64::LDURWi:523return AArch64::LDRWpost;524case AArch64::LDRXui:525case AArch64::LDURXi:526return AArch64::LDRXpost;527case AArch64::LDRSWui:528return AArch64::LDRSWpost;529case AArch64::LDPSi:530return AArch64::LDPSpost;531case AArch64::LDPSWi:532return AArch64::LDPSWpost;533case AArch64::LDPDi:534return AArch64::LDPDpost;535case AArch64::LDPQi:536return 
AArch64::LDPQpost;537case AArch64::LDPWi:538return AArch64::LDPWpost;539case AArch64::LDPXi:540return AArch64::LDPXpost;541case AArch64::STPSi:542return AArch64::STPSpost;543case AArch64::STPDi:544return AArch64::STPDpost;545case AArch64::STPQi:546return AArch64::STPQpost;547case AArch64::STPWi:548return AArch64::STPWpost;549case AArch64::STPXi:550return AArch64::STPXpost;551case AArch64::STGi:552return AArch64::STGPostIndex;553case AArch64::STZGi:554return AArch64::STZGPostIndex;555case AArch64::ST2Gi:556return AArch64::ST2GPostIndex;557case AArch64::STZ2Gi:558return AArch64::STZ2GPostIndex;559case AArch64::STGPi:560return AArch64::STGPpost;561}562}563564static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {565566unsigned OpcA = FirstMI.getOpcode();567unsigned OpcB = MI.getOpcode();568569switch (OpcA) {570default:571return false;572case AArch64::STRSpre:573return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);574case AArch64::STRDpre:575return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);576case AArch64::STRQpre:577return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);578case AArch64::STRWpre:579return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);580case AArch64::STRXpre:581return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);582case AArch64::LDRSpre:583return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);584case AArch64::LDRDpre:585return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);586case AArch64::LDRQpre:587return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);588case AArch64::LDRWpre:589return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);590case AArch64::LDRXpre:591return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);592case AArch64::LDRSWpre:593return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);594}595}596597// Returns the scale and offset range of pre/post indexed variants of MI.598static void getPrePostIndexedMemOpInfo(const 
MachineInstr &MI, int &Scale,599int &MinOffset, int &MaxOffset) {600bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);601bool IsTagStore = isTagStore(MI);602// ST*G and all paired ldst have the same scale in pre/post-indexed variants603// as in the "unsigned offset" variant.604// All other pre/post indexed ldst instructions are unscaled.605Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;606607if (IsPaired) {608MinOffset = -64;609MaxOffset = 63;610} else {611MinOffset = -256;612MaxOffset = 255;613}614}615616static MachineOperand &getLdStRegOp(MachineInstr &MI,617unsigned PairedRegOp = 0) {618assert(PairedRegOp < 2 && "Unexpected register operand idx.");619bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);620if (IsPreLdSt)621PairedRegOp += 1;622unsigned Idx =623AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;624return MI.getOperand(Idx);625}626627static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,628MachineInstr &StoreInst,629const AArch64InstrInfo *TII) {630assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");631int LoadSize = TII->getMemScale(LoadInst);632int StoreSize = TII->getMemScale(StoreInst);633int UnscaledStOffset =634TII->hasUnscaledLdStOffset(StoreInst)635? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()636: AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;637int UnscaledLdOffset =638TII->hasUnscaledLdStOffset(LoadInst)639? 
AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()640: AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;641return (UnscaledStOffset <= UnscaledLdOffset) &&642(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));643}644645static bool isPromotableZeroStoreInst(MachineInstr &MI) {646unsigned Opc = MI.getOpcode();647return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||648isNarrowStore(Opc)) &&649getLdStRegOp(MI).getReg() == AArch64::WZR;650}651652static bool isPromotableLoadFromStore(MachineInstr &MI) {653switch (MI.getOpcode()) {654default:655return false;656// Scaled instructions.657case AArch64::LDRBBui:658case AArch64::LDRHHui:659case AArch64::LDRWui:660case AArch64::LDRXui:661// Unscaled instructions.662case AArch64::LDURBBi:663case AArch64::LDURHHi:664case AArch64::LDURWi:665case AArch64::LDURXi:666return true;667}668}669670static bool isMergeableLdStUpdate(MachineInstr &MI) {671unsigned Opc = MI.getOpcode();672switch (Opc) {673default:674return false;675// Scaled instructions.676case AArch64::STRSui:677case AArch64::STRDui:678case AArch64::STRQui:679case AArch64::STRXui:680case AArch64::STRWui:681case AArch64::STRHHui:682case AArch64::STRBBui:683case AArch64::LDRSui:684case AArch64::LDRDui:685case AArch64::LDRQui:686case AArch64::LDRXui:687case AArch64::LDRWui:688case AArch64::LDRHHui:689case AArch64::LDRBBui:690case AArch64::STGi:691case AArch64::STZGi:692case AArch64::ST2Gi:693case AArch64::STZ2Gi:694case AArch64::STGPi:695// Unscaled instructions.696case AArch64::STURSi:697case AArch64::STURDi:698case AArch64::STURQi:699case AArch64::STURWi:700case AArch64::STURXi:701case AArch64::LDURSi:702case AArch64::LDURDi:703case AArch64::LDURQi:704case AArch64::LDURWi:705case AArch64::LDURXi:706// Paired instructions.707case AArch64::LDPSi:708case AArch64::LDPSWi:709case AArch64::LDPDi:710case AArch64::LDPQi:711case AArch64::LDPWi:712case AArch64::LDPXi:713case AArch64::STPSi:714case AArch64::STPDi:715case AArch64::STPQi:716case 
AArch64::STPWi:717case AArch64::STPXi:718// Make sure this is a reg+imm (as opposed to an address reloc).719if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())720return false;721722return true;723}724}725726static bool isRewritableImplicitDef(unsigned Opc) {727switch (Opc) {728default:729return false;730case AArch64::ORRWrs:731case AArch64::ADDWri:732return true;733}734}735736MachineBasicBlock::iterator737AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,738MachineBasicBlock::iterator MergeMI,739const LdStPairFlags &Flags) {740assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&741"Expected promotable zero stores.");742743MachineBasicBlock::iterator E = I->getParent()->end();744MachineBasicBlock::iterator NextI = next_nodbg(I, E);745// If NextI is the second of the two instructions to be merged, we need746// to skip one further. Either way we merge will invalidate the iterator,747// and we don't need to scan the new instruction, as it's a pairwise748// instruction, which we're not considering for further action anyway.749if (NextI == MergeMI)750NextI = next_nodbg(NextI, E);751752unsigned Opc = I->getOpcode();753unsigned MergeMIOpc = MergeMI->getOpcode();754bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);755bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);756int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;757int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;758759bool MergeForward = Flags.getMergeForward();760// Insert our new paired instruction after whichever of the paired761// instructions MergeForward indicates.762MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;763// Also based on MergeForward is from where we copy the base register operand764// so we get the flags compatible with the input code.765const MachineOperand &BaseRegOp =766MergeForward ? 
AArch64InstrInfo::getLdStBaseOp(*MergeMI)767: AArch64InstrInfo::getLdStBaseOp(*I);768769// Which register is Rt and which is Rt2 depends on the offset order.770int64_t IOffsetInBytes =771AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;772int64_t MIOffsetInBytes =773AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *774MergeMIOffsetStride;775// Select final offset based on the offset order.776int64_t OffsetImm;777if (IOffsetInBytes > MIOffsetInBytes)778OffsetImm = MIOffsetInBytes;779else780OffsetImm = IOffsetInBytes;781782int NewOpcode = getMatchingWideOpcode(Opc);783bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);784785// Adjust final offset if the result opcode is a scaled store.786if (FinalIsScaled) {787int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;788assert(((OffsetImm % NewOffsetStride) == 0) &&789"Offset should be a multiple of the store memory scale");790OffsetImm = OffsetImm / NewOffsetStride;791}792793// Construct the new instruction.794DebugLoc DL = I->getDebugLoc();795MachineBasicBlock *MBB = I->getParent();796MachineInstrBuilder MIB;797MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))798.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)799.add(BaseRegOp)800.addImm(OffsetImm)801.cloneMergedMemRefs({&*I, &*MergeMI})802.setMIFlags(I->mergeFlagsWith(*MergeMI));803(void)MIB;804805LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");806LLVM_DEBUG(I->print(dbgs()));807LLVM_DEBUG(dbgs() << " ");808LLVM_DEBUG(MergeMI->print(dbgs()));809LLVM_DEBUG(dbgs() << " with instruction:\n ");810LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));811LLVM_DEBUG(dbgs() << "\n");812813// Erase the old instructions.814I->eraseFromParent();815MergeMI->eraseFromParent();816return NextI;817}818819// Apply Fn to all instructions between MI and the beginning of the block, until820// a def for DefReg is reached. Returns true, iff Fn returns true for all821// visited instructions. 
Stop after visiting Limit iterations.822static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,823const TargetRegisterInfo *TRI, unsigned Limit,824std::function<bool(MachineInstr &, bool)> &Fn) {825auto MBB = MI.getParent();826for (MachineInstr &I :827instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {828if (!Limit)829return false;830--Limit;831832bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {833return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&834TRI->regsOverlap(MOP.getReg(), DefReg);835});836if (!Fn(I, isDef))837return false;838if (isDef)839break;840}841return true;842}843844static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,845const TargetRegisterInfo *TRI) {846847for (const MachineOperand &MOP : phys_regs_and_masks(MI))848if (MOP.isReg() && MOP.isKill())849Units.removeReg(MOP.getReg());850851for (const MachineOperand &MOP : phys_regs_and_masks(MI))852if (MOP.isReg() && !MOP.isKill())853Units.addReg(MOP.getReg());854}855856MachineBasicBlock::iterator857AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,858MachineBasicBlock::iterator Paired,859const LdStPairFlags &Flags) {860MachineBasicBlock::iterator E = I->getParent()->end();861MachineBasicBlock::iterator NextI = next_nodbg(I, E);862// If NextI is the second of the two instructions to be merged, we need863// to skip one further. Either way we merge will invalidate the iterator,864// and we don't need to scan the new instruction, as it's a pairwise865// instruction, which we're not considering for further action anyway.866if (NextI == Paired)867NextI = next_nodbg(NextI, E);868869int SExtIdx = Flags.getSExtIdx();870unsigned Opc =871SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());872bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);873int OffsetStride = IsUnscaled ? 
TII->getMemScale(*I) : 1;874875bool MergeForward = Flags.getMergeForward();876877std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();878if (RenameReg) {879MCRegister RegToRename = getLdStRegOp(*I).getReg();880DefinedInBB.addReg(*RenameReg);881882// Return the sub/super register for RenameReg, matching the size of883// OriginalReg.884auto GetMatchingSubReg =885[this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {886for (MCPhysReg SubOrSuper :887TRI->sub_and_superregs_inclusive(*RenameReg)) {888if (C->contains(SubOrSuper))889return SubOrSuper;890}891llvm_unreachable("Should have found matching sub or super register!");892};893894std::function<bool(MachineInstr &, bool)> UpdateMIs =895[this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,896bool IsDef) {897if (IsDef) {898bool SeenDef = false;899for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {900MachineOperand &MOP = MI.getOperand(OpIdx);901// Rename the first explicit definition and all implicit902// definitions matching RegToRename.903if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&904(!MergeForward || !SeenDef ||905(MOP.isDef() && MOP.isImplicit())) &&906TRI->regsOverlap(MOP.getReg(), RegToRename)) {907assert((MOP.isImplicit() ||908(MOP.isRenamable() && !MOP.isEarlyClobber())) &&909"Need renamable operands");910Register MatchingReg;911if (const TargetRegisterClass *RC =912MI.getRegClassConstraint(OpIdx, TII, TRI))913MatchingReg = GetMatchingSubReg(RC);914else {915if (!isRewritableImplicitDef(MI.getOpcode()))916continue;917MatchingReg = GetMatchingSubReg(918TRI->getMinimalPhysRegClass(MOP.getReg()));919}920MOP.setReg(MatchingReg);921SeenDef = true;922}923}924} else {925for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {926MachineOperand &MOP = MI.getOperand(OpIdx);927if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&928TRI->regsOverlap(MOP.getReg(), RegToRename)) {929assert((MOP.isImplicit() ||930(MOP.isRenamable() && !MOP.isEarlyClobber())) &&931"Need 
renamable operands");932Register MatchingReg;933if (const TargetRegisterClass *RC =934MI.getRegClassConstraint(OpIdx, TII, TRI))935MatchingReg = GetMatchingSubReg(RC);936else937MatchingReg = GetMatchingSubReg(938TRI->getMinimalPhysRegClass(MOP.getReg()));939assert(MatchingReg != AArch64::NoRegister &&940"Cannot find matching regs for renaming");941MOP.setReg(MatchingReg);942}943}944}945LLVM_DEBUG(dbgs() << "Renamed " << MI);946return true;947};948forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI,949UINT32_MAX, UpdateMIs);950951#if !defined(NDEBUG)952// For forward merging store:953// Make sure the register used for renaming is not used between the954// paired instructions. That would trash the content before the new955// paired instruction.956MCPhysReg RegToCheck = *RenameReg;957// For backward merging load:958// Make sure the register being renamed is not used between the959// paired instructions. That would trash the content after the new960// paired instruction.961if (!MergeForward)962RegToCheck = RegToRename;963for (auto &MI :964iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(965MergeForward ? std::next(I) : I,966MergeForward ? std::next(Paired) : Paired))967assert(all_of(MI.operands(),968[this, RegToCheck](const MachineOperand &MOP) {969return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||970MOP.isUndef() ||971!TRI->regsOverlap(MOP.getReg(), RegToCheck);972}) &&973"Rename register used between paired instruction, trashing the "974"content");975#endif976}977978// Insert our new paired instruction after whichever of the paired979// instructions MergeForward indicates.980MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;981// Also based on MergeForward is from where we copy the base register operand982// so we get the flags compatible with the input code.983const MachineOperand &BaseRegOp =984MergeForward ? 
AArch64InstrInfo::getLdStBaseOp(*Paired)985: AArch64InstrInfo::getLdStBaseOp(*I);986987int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();988int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();989bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());990if (IsUnscaled != PairedIsUnscaled) {991// We're trying to pair instructions that differ in how they are scaled. If992// I is scaled then scale the offset of Paired accordingly. Otherwise, do993// the opposite (i.e., make Paired's offset unscaled).994int MemSize = TII->getMemScale(*Paired);995if (PairedIsUnscaled) {996// If the unscaled offset isn't a multiple of the MemSize, we can't997// pair the operations together.998assert(!(PairedOffset % TII->getMemScale(*Paired)) &&999"Offset should be a multiple of the stride!");1000PairedOffset /= MemSize;1001} else {1002PairedOffset *= MemSize;1003}1004}10051006// Which register is Rt and which is Rt2 depends on the offset order.1007// However, for pre load/stores the Rt should be the one of the pre1008// load/store.1009MachineInstr *RtMI, *Rt2MI;1010if (Offset == PairedOffset + OffsetStride &&1011!AArch64InstrInfo::isPreLdSt(*I)) {1012RtMI = &*Paired;1013Rt2MI = &*I;1014// Here we swapped the assumption made for SExtIdx.1015// I.e., we turn ldp I, Paired into ldp Paired, I.1016// Update the index accordingly.1017if (SExtIdx != -1)1018SExtIdx = (SExtIdx + 1) % 2;1019} else {1020RtMI = &*I;1021Rt2MI = &*Paired;1022}1023int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();1024// Scale the immediate offset, if necessary.1025if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {1026assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&1027"Unscaled offset cannot be scaled.");1028OffsetImm /= TII->getMemScale(*RtMI);1029}10301031// Construct the new instruction.1032MachineInstrBuilder MIB;1033DebugLoc DL = I->getDebugLoc();1034MachineBasicBlock *MBB = I->getParent();1035MachineOperand RegOp0 = 
getLdStRegOp(*RtMI);1036MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);1037MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;1038// Kill flags may become invalid when moving stores for pairing.1039if (RegOp0.isUse()) {1040if (!MergeForward) {1041// Clear kill flags on store if moving upwards. Example:1042// STRWui kill %w0, ...1043// USE %w11044// STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards1045// We are about to move the store of w1, so its kill flag may become1046// invalid; not the case for w0.1047// Since w1 is used between the stores, the kill flag on w1 is cleared1048// after merging.1049// STPWi kill %w0, %w1, ...1050// USE %w11051for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)1052if (It->readsRegister(PairedRegOp.getReg(), TRI))1053PairedRegOp.setIsKill(false);1054} else {1055// Clear kill flags of the first stores register. Example:1056// STRWui %w1, ...1057// USE kill %w1 ; need to clear kill flag when moving STRWui downwards1058// STRW %w01059Register Reg = getLdStRegOp(*I).getReg();1060for (MachineInstr &MI : make_range(std::next(I), Paired))1061MI.clearRegisterKills(Reg, TRI);1062}1063}10641065unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);1066MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));10671068// Adds the pre-index operand for pre-indexed ld/st pairs.1069if (AArch64InstrInfo::isPreLdSt(*RtMI))1070MIB.addReg(BaseRegOp.getReg(), RegState::Define);10711072MIB.add(RegOp0)1073.add(RegOp1)1074.add(BaseRegOp)1075.addImm(OffsetImm)1076.cloneMergedMemRefs({&*I, &*Paired})1077.setMIFlags(I->mergeFlagsWith(*Paired));10781079(void)MIB;10801081LLVM_DEBUG(1082dbgs() << "Creating pair load/store. 
Replacing instructions:\n ");1083LLVM_DEBUG(I->print(dbgs()));1084LLVM_DEBUG(dbgs() << " ");1085LLVM_DEBUG(Paired->print(dbgs()));1086LLVM_DEBUG(dbgs() << " with instruction:\n ");1087if (SExtIdx != -1) {1088// Generate the sign extension for the proper result of the ldp.1089// I.e., with X1, that would be:1090// %w1 = KILL %w1, implicit-def %x11091// %x1 = SBFMXri killed %x1, 0, 311092MachineOperand &DstMO = MIB->getOperand(SExtIdx);1093// Right now, DstMO has the extended register, since it comes from an1094// extended opcode.1095Register DstRegX = DstMO.getReg();1096// Get the W variant of that register.1097Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);1098// Update the result of LDP to use the W instead of the X variant.1099DstMO.setReg(DstRegW);1100LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));1101LLVM_DEBUG(dbgs() << "\n");1102// Make the machine verifier happy by providing a definition for1103// the X register.1104// Insert this definition right after the generated LDP, i.e., before1105// InsertionPoint.1106MachineInstrBuilder MIBKill =1107BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)1108.addReg(DstRegW)1109.addReg(DstRegX, RegState::Define);1110MIBKill->getOperand(2).setImplicit();1111// Create the sign extension.1112MachineInstrBuilder MIBSXTW =1113BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)1114.addReg(DstRegX)1115.addImm(0)1116.addImm(31);1117(void)MIBSXTW;1118LLVM_DEBUG(dbgs() << " Extend operand:\n ");1119LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));1120} else {1121LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));1122}1123LLVM_DEBUG(dbgs() << "\n");11241125if (MergeForward)1126for (const MachineOperand &MOP : phys_regs_and_masks(*I))1127if (MOP.isReg() && MOP.isKill())1128DefinedInBB.addReg(MOP.getReg());11291130// Erase the old instructions.1131I->eraseFromParent();1132Paired->eraseFromParent();11331134return 
NextI;1135}11361137MachineBasicBlock::iterator1138AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,1139MachineBasicBlock::iterator StoreI) {1140MachineBasicBlock::iterator NextI =1141next_nodbg(LoadI, LoadI->getParent()->end());11421143int LoadSize = TII->getMemScale(*LoadI);1144int StoreSize = TII->getMemScale(*StoreI);1145Register LdRt = getLdStRegOp(*LoadI).getReg();1146const MachineOperand &StMO = getLdStRegOp(*StoreI);1147Register StRt = getLdStRegOp(*StoreI).getReg();1148bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);11491150assert((IsStoreXReg ||1151TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&1152"Unexpected RegClass");11531154MachineInstr *BitExtMI;1155if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {1156// Remove the load, if the destination register of the loads is the same1157// register for stored value.1158if (StRt == LdRt && LoadSize == 8) {1159for (MachineInstr &MI : make_range(StoreI->getIterator(),1160LoadI->getIterator())) {1161if (MI.killsRegister(StRt, TRI)) {1162MI.clearRegisterKills(StRt, TRI);1163break;1164}1165}1166LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");1167LLVM_DEBUG(LoadI->print(dbgs()));1168LLVM_DEBUG(dbgs() << "\n");1169LoadI->eraseFromParent();1170return NextI;1171}1172// Replace the load with a mov if the load and store are in the same size.1173BitExtMI =1174BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),1175TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)1176.addReg(IsStoreXReg ? 
AArch64::XZR : AArch64::WZR)1177.add(StMO)1178.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))1179.setMIFlags(LoadI->getFlags());1180} else {1181// FIXME: Currently we disable this transformation in big-endian targets as1182// performance and correctness are verified only in little-endian.1183if (!Subtarget->isLittleEndian())1184return NextI;1185bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);1186assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&1187"Unsupported ld/st match");1188assert(LoadSize <= StoreSize && "Invalid load size");1189int UnscaledLdOffset =1190IsUnscaled1191? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()1192: AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;1193int UnscaledStOffset =1194IsUnscaled1195? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()1196: AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;1197int Width = LoadSize * 8;1198Register DestReg =1199IsStoreXReg ? Register(TRI->getMatchingSuperReg(1200LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))1201: LdRt;12021203assert((UnscaledLdOffset >= UnscaledStOffset &&1204(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&1205"Invalid offset");12061207int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);1208int Imms = Immr + Width - 1;1209if (UnscaledLdOffset == UnscaledStOffset) {1210uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N1211| ((Immr) << 6) // immr1212| ((Imms) << 0) // imms1213;12141215BitExtMI =1216BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),1217TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),1218DestReg)1219.add(StMO)1220.addImm(AndMaskEncoded)1221.setMIFlags(LoadI->getFlags());1222} else {1223BitExtMI =1224BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),1225TII->get(IsStoreXReg ? 
AArch64::UBFMXri : AArch64::UBFMWri),1226DestReg)1227.add(StMO)1228.addImm(Immr)1229.addImm(Imms)1230.setMIFlags(LoadI->getFlags());1231}1232}12331234// Clear kill flags between store and load.1235for (MachineInstr &MI : make_range(StoreI->getIterator(),1236BitExtMI->getIterator()))1237if (MI.killsRegister(StRt, TRI)) {1238MI.clearRegisterKills(StRt, TRI);1239break;1240}12411242LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");1243LLVM_DEBUG(StoreI->print(dbgs()));1244LLVM_DEBUG(dbgs() << " ");1245LLVM_DEBUG(LoadI->print(dbgs()));1246LLVM_DEBUG(dbgs() << " with instructions:\n ");1247LLVM_DEBUG(StoreI->print(dbgs()));1248LLVM_DEBUG(dbgs() << " ");1249LLVM_DEBUG((BitExtMI)->print(dbgs()));1250LLVM_DEBUG(dbgs() << "\n");12511252// Erase the old instructions.1253LoadI->eraseFromParent();1254return NextI;1255}12561257static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {1258// Convert the byte-offset used by unscaled into an "element" offset used1259// by the scaled pair load/store instructions.1260if (IsUnscaled) {1261// If the byte-offset isn't a multiple of the stride, there's no point1262// trying to match it.1263if (Offset % OffsetStride)1264return false;1265Offset /= OffsetStride;1266}1267return Offset <= 63 && Offset >= -64;1268}12691270// Do alignment, specialized to power of 2 and for signed ints,1271// avoiding having to do a C-style cast from uint_64t to int when1272// using alignTo from include/llvm/Support/MathExtras.h.1273// FIXME: Move this function to include/MathExtras.h?1274static int alignTo(int Num, int PowOf2) {1275return (Num + PowOf2 - 1) & ~(PowOf2 - 1);1276}12771278static bool mayAlias(MachineInstr &MIa,1279SmallVectorImpl<MachineInstr *> &MemInsns,1280AliasAnalysis *AA) {1281for (MachineInstr *MIb : MemInsns) {1282if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {1283LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());1284return true;1285}1286}12871288LLVM_DEBUG(dbgs() << "No aliases found\n");1289return 
false;1290}12911292bool AArch64LoadStoreOpt::findMatchingStore(1293MachineBasicBlock::iterator I, unsigned Limit,1294MachineBasicBlock::iterator &StoreI) {1295MachineBasicBlock::iterator B = I->getParent()->begin();1296MachineBasicBlock::iterator MBBI = I;1297MachineInstr &LoadMI = *I;1298Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();12991300// If the load is the first instruction in the block, there's obviously1301// not any matching store.1302if (MBBI == B)1303return false;13041305// Track which register units have been modified and used between the first1306// insn and the second insn.1307ModifiedRegUnits.clear();1308UsedRegUnits.clear();13091310unsigned Count = 0;1311do {1312MBBI = prev_nodbg(MBBI, B);1313MachineInstr &MI = *MBBI;13141315// Don't count transient instructions towards the search limit since there1316// may be different numbers of them if e.g. debug information is present.1317if (!MI.isTransient())1318++Count;13191320// If the load instruction reads directly from the address to which the1321// store instruction writes and the stored value is not modified, we can1322// promote the load. 
Since we do not handle stores with pre-/post-index,1323// it's unnecessary to check if BaseReg is modified by the store itself.1324// Also we can't handle stores without an immediate offset operand,1325// while the operand might be the address for a global variable.1326if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&1327BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&1328AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&1329isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&1330ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {1331StoreI = MBBI;1332return true;1333}13341335if (MI.isCall())1336return false;13371338// Update modified / uses register units.1339LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);13401341// Otherwise, if the base register is modified, we have no match, so1342// return early.1343if (!ModifiedRegUnits.available(BaseReg))1344return false;13451346// If we encounter a store aliased with the load, return early.1347if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))1348return false;1349} while (MBBI != B && Count < Limit);1350return false;1351}13521353static bool needsWinCFI(const MachineFunction *MF) {1354return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&1355MF->getFunction().needsUnwindTableEntry();1356}13571358// Returns true if FirstMI and MI are candidates for merging or pairing.1359// Otherwise, returns false.1360static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,1361LdStPairFlags &Flags,1362const AArch64InstrInfo *TII) {1363// If this is volatile or if pairing is suppressed, not a candidate.1364if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))1365return false;13661367// We should have already checked FirstMI for pair suppression and volatility.1368assert(!FirstMI.hasOrderedMemoryRef() &&1369!TII->isLdStPairSuppressed(FirstMI) &&1370"FirstMI shouldn't get here if either of these checks are true.");13711372if (needsWinCFI(MI.getMF()) && 
(MI.getFlag(MachineInstr::FrameSetup) ||1373MI.getFlag(MachineInstr::FrameDestroy)))1374return false;13751376unsigned OpcA = FirstMI.getOpcode();1377unsigned OpcB = MI.getOpcode();13781379// Opcodes match: If the opcodes are pre ld/st there is nothing more to check.1380if (OpcA == OpcB)1381return !AArch64InstrInfo::isPreLdSt(FirstMI);13821383// Two pre ld/st of different opcodes cannot be merged either1384if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))1385return false;13861387// Try to match a sign-extended load/store with a zero-extended load/store.1388bool IsValidLdStrOpc, PairIsValidLdStrOpc;1389unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);1390assert(IsValidLdStrOpc &&1391"Given Opc should be a Load or Store with an immediate");1392// OpcA will be the first instruction in the pair.1393if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {1394Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);1395return true;1396}13971398// If the second instruction isn't even a mergable/pairable load/store, bail1399// out.1400if (!PairIsValidLdStrOpc)1401return false;14021403// FIXME: We don't support merging narrow stores with mixed scaled/unscaled1404// offsets.1405if (isNarrowStore(OpcA) || isNarrowStore(OpcB))1406return false;14071408// The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and1409// LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui1410// are candidate pairs that can be merged.1411if (isPreLdStPairCandidate(FirstMI, MI))1412return true;14131414// Try to match an unscaled load/store with a scaled load/store.1415return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&1416getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);14171418// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?1419}14201421static bool canRenameMOP(const MachineOperand &MOP,1422const TargetRegisterInfo *TRI) {1423if (MOP.isReg()) {1424auto *RegClass = 
TRI->getMinimalPhysRegClass(MOP.getReg());
    // Renaming registers with multiple disjunct sub-registers (e.g. the
    // result of a LD3) means that all sub-registers are renamed, potentially
    // impacting other instructions we did not check. Bail out.
    // Note that this relies on the structure of the AArch64 register file. In
    // particular, a subregister cannot be written without overwriting the
    // whole register.
    if (RegClass->HasDisjunctSubRegs) {
      LLVM_DEBUG(
          dbgs()
          << "  Cannot rename operands with multiple disjunct subregisters ("
          << MOP << ")\n");
      return false;
    }

    // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
    // them must be known. For example, in ORRWrs the implicit-def
    // corresponds to the result register.
    if (MOP.isImplicit() && MOP.isDef()) {
      if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
        return false;
      return TRI->isSuperOrSubRegisterEq(
          MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
    }
  }
  return MOP.isImplicit() ||
         (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
}

// For a store FirstMI, check whether the stored register can be renamed in
// every instruction from FirstMI back to (and including) the instruction that
// defines it. Along the way, UsedInBetween accumulates all registers touched
// and RequiredClasses collects the register classes the rename register must
// be compatible with.
static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.

  auto RegToRename = getLdStRegOp(FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(FirstMI).isKill() &&
      !any_of(FirstMI.operands(),
              [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegToRename, MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI);
    return false;
  }

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI);
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find an non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

// We want to merge the second load into the first by rewriting the usages of
// the same reg between first (incl.) and second (excl.). We don't need to care
// about any insns before FirstLoad or after SecondLoad.
// 1. The second load writes new value into the same reg.
//    - The renaming is impossible to impact later use of the reg.
//    - The second load always trash the value written by the first load which
//      means the reg must be killed before the second load.
// 2.
//    The first load must be a def for the same reg so we don't need to look
//    into anything before it.
static bool canRenameUntilSecondLoad(
    MachineInstr &FirstLoad, MachineInstr &SecondLoad,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  if (FirstLoad.isPseudo())
    return false;

  UsedInBetween.accumulate(FirstLoad);
  auto RegToRename = getLdStRegOp(FirstLoad).getReg();
  // Check every instruction in [FirstLoad, SecondLoad): all operands that
  // overlap RegToRename must be renamable, and their register classes are
  // collected so a compatible rename register can be chosen later.
  bool Success = std::all_of(
      FirstLoad.getIterator(), SecondLoad.getIterator(),
      [&](MachineInstr &MI) {
        LLVM_DEBUG(dbgs() << "Checking " << MI);
        // Currently we do not try to rename across frame-setup instructions.
        if (MI.getFlag(MachineInstr::FrameSetup)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                            << "currently\n");
          return false;
        }

        for (auto &MOP : MI.operands()) {
          if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
              !TRI->regsOverlap(MOP.getReg(), RegToRename))
            continue;
          if (!canRenameMOP(MOP, TRI)) {
            LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
            return false;
          }
          RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
        }

        return true;
      });
  return Success;
}

// Check if we can find a physical register for renaming \p Reg. This register
// must:
// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
//   defined registers up to the point where the renamed register will be used,
// * not used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range the rename register will be used,
// * is available in all used register classes (checked using RequiredClasses).
// On success the chosen register is also added to DefinedInBB, so subsequent
// queries will not hand out the same register again.
static std::optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(
          TRI->sub_and_superregs_inclusive(PR),
          [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return std::nullopt;
}

// For store pairs:
// returns a register from FirstMI to the beginning of the
// block that can be renamed.
// For load pairs: returns a register from FirstMI to MI that can be renamed.
// Lazily evaluates (and caches, via MaybeCanRename) whether renaming is legal
// at all before searching for a concrete rename register.
static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
    std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
    Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  std::optional<MCPhysReg> RenameReg;
  if (!DebugCounter::shouldExecute(RegRenamingCounter))
    return RenameReg;

  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return RenameReg;

  const bool IsLoad = FirstMI.mayLoad();

  if (!MaybeCanRename) {
    if (IsLoad)
      MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
                                                 RequiredClasses, TRI)};
    else
      MaybeCanRename = {
          canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
  }

  if (*MaybeCanRename) {
    RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
                                          RequiredClasses, TRI);
  }
  return RenameReg;
}

/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
/// Returns an iterator to the matching instruction on success, or the block
/// end iterator E when no match is found. Flags is filled in with the merge
/// direction, the sign-extend operand index, and a rename register if one is
/// required to make the pairing legal.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can't be paired with a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction is
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1 [x2]
        //   ldr x2 [x3]
        //   ldr x4 [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2]
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                Reg, getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(&MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          // The first instruction's register was modified in between; try to
          // rename it and remember this candidate, but keep scanning for a
          // rename-free match first.
          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store.  Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

// NOTE(review): definition continues beyond this chunk. The visible early-out
// returns the block end iterator unless MaybeCFI is a CFI_INSTRUCTION and MI
// is an SP-based frame-setup/destroy load/store.
static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  auto End = MI.getParent()->end();
  if (MaybeCFI == End ||
      MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
      !(MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy)) ||
      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
    return End;

  const MachineFunction &MF = *MI.getParent()->getParent();
  unsigned CFIIndex =
MaybeCFI->getOperand(0).getCFIIndex();1963const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];1964switch (CFI.getOperation()) {1965case MCCFIInstruction::OpDefCfa:1966case MCCFIInstruction::OpDefCfaOffset:1967return MaybeCFI;1968default:1969return End;1970}1971}19721973MachineBasicBlock::iterator1974AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,1975MachineBasicBlock::iterator Update,1976bool IsPreIdx) {1977assert((Update->getOpcode() == AArch64::ADDXri ||1978Update->getOpcode() == AArch64::SUBXri) &&1979"Unexpected base register update instruction to merge!");1980MachineBasicBlock::iterator E = I->getParent()->end();1981MachineBasicBlock::iterator NextI = next_nodbg(I, E);19821983// If updating the SP and the following instruction is CFA offset related CFI1984// instruction move it after the merged instruction.1985MachineBasicBlock::iterator CFI =1986IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;19871988// Return the instruction following the merged instruction, which is1989// the instruction following our unmerged load. Unless that's the add/sub1990// instruction we're merging, in which case it's the one after that.1991if (NextI == Update)1992NextI = next_nodbg(NextI, E);19931994int Value = Update->getOperand(2).getImm();1995assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&1996"Can't merge 1 << 12 offset into pre-/post-indexed load / store");1997if (Update->getOpcode() == AArch64::SUBXri)1998Value = -Value;19992000unsigned NewOpc = IsPreIdx ? 
getPreIndexedOpcode(I->getOpcode())2001: getPostIndexedOpcode(I->getOpcode());2002MachineInstrBuilder MIB;2003int Scale, MinOffset, MaxOffset;2004getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);2005if (!AArch64InstrInfo::isPairedLdSt(*I)) {2006// Non-paired instruction.2007MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))2008.add(getLdStRegOp(*Update))2009.add(getLdStRegOp(*I))2010.add(AArch64InstrInfo::getLdStBaseOp(*I))2011.addImm(Value / Scale)2012.setMemRefs(I->memoperands())2013.setMIFlags(I->mergeFlagsWith(*Update));2014} else {2015// Paired instruction.2016MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))2017.add(getLdStRegOp(*Update))2018.add(getLdStRegOp(*I, 0))2019.add(getLdStRegOp(*I, 1))2020.add(AArch64InstrInfo::getLdStBaseOp(*I))2021.addImm(Value / Scale)2022.setMemRefs(I->memoperands())2023.setMIFlags(I->mergeFlagsWith(*Update));2024}2025if (CFI != E) {2026MachineBasicBlock *MBB = I->getParent();2027MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);2028}20292030if (IsPreIdx) {2031++NumPreFolded;2032LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");2033} else {2034++NumPostFolded;2035LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");2036}2037LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");2038LLVM_DEBUG(I->print(dbgs()));2039LLVM_DEBUG(dbgs() << " ");2040LLVM_DEBUG(Update->print(dbgs()));2041LLVM_DEBUG(dbgs() << " with instruction:\n ");2042LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));2043LLVM_DEBUG(dbgs() << "\n");20442045// Erase the old instructions for the block.2046I->eraseFromParent();2047Update->eraseFromParent();20482049return NextI;2050}20512052bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,2053MachineInstr &MI,2054unsigned BaseReg, int Offset) {2055switch (MI.getOpcode()) {2056default:2057break;2058case AArch64::SUBXri:2059case AArch64::ADDXri:2060// Make sure it's a vanilla immediate operand, not a relocation or2061// anything else 
we can't handle.2062if (!MI.getOperand(2).isImm())2063break;2064// Watch out for 1 << 12 shifted value.2065if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))2066break;20672068// The update instruction source and destination register must be the2069// same as the load/store base register.2070if (MI.getOperand(0).getReg() != BaseReg ||2071MI.getOperand(1).getReg() != BaseReg)2072break;20732074int UpdateOffset = MI.getOperand(2).getImm();2075if (MI.getOpcode() == AArch64::SUBXri)2076UpdateOffset = -UpdateOffset;20772078// The immediate must be a multiple of the scaling factor of the pre/post2079// indexed instruction.2080int Scale, MinOffset, MaxOffset;2081getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);2082if (UpdateOffset % Scale != 0)2083break;20842085// Scaled offset must fit in the instruction immediate.2086int ScaledOffset = UpdateOffset / Scale;2087if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)2088break;20892090// If we have a non-zero Offset, we check that it matches the amount2091// we're adding to the register.2092if (!Offset || Offset == UpdateOffset)2093return true;2094break;2095}2096return false;2097}20982099MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(2100MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {2101MachineBasicBlock::iterator E = I->getParent()->end();2102MachineInstr &MemMI = *I;2103MachineBasicBlock::iterator MBBI = I;21042105Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();2106int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *2107TII->getMemScale(MemMI);21082109// Scan forward looking for post-index opportunities. Updating instructions2110// can't be formed if the memory instruction doesn't have the offset we're2111// looking for.2112if (MIUnscaledOffset != UnscaledOffset)2113return E;21142115// If the base register overlaps a source/destination register, we can't2116// merge the update. 
This does not apply to tag store instructions which2117// ignore the address part of the source register.2118// This does not apply to STGPi as well, which does not have unpredictable2119// behavior in this case unlike normal stores, and always performs writeback2120// after reading the source register value.2121if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {2122bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);2123for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {2124Register DestReg = getLdStRegOp(MemMI, i).getReg();2125if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))2126return E;2127}2128}21292130// Track which register units have been modified and used between the first2131// insn (inclusive) and the second insn.2132ModifiedRegUnits.clear();2133UsedRegUnits.clear();2134MBBI = next_nodbg(MBBI, E);21352136// We can't post-increment the stack pointer if any instruction between2137// the memory access (I) and the increment (MBBI) can access the memory2138// region defined by [SP, MBBI].2139const bool BaseRegSP = BaseReg == AArch64::SP;2140if (BaseRegSP && needsWinCFI(I->getMF())) {2141// FIXME: For now, we always block the optimization over SP in windows2142// targets as it requires to adjust the unwind/debug info, messing up2143// the unwind info can actually cause a miscompile.2144return E;2145}21462147for (unsigned Count = 0; MBBI != E && Count < Limit;2148MBBI = next_nodbg(MBBI, E)) {2149MachineInstr &MI = *MBBI;21502151// Don't count transient instructions towards the search limit since there2152// may be different numbers of them if e.g. 
debug information is present.2153if (!MI.isTransient())2154++Count;21552156// If we found a match, return it.2157if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))2158return MBBI;21592160// Update the status of what the instruction clobbered and used.2161LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);21622163// Otherwise, if the base register is used or modified, we have no match, so2164// return early.2165// If we are optimizing SP, do not allow instructions that may load or store2166// in between the load and the optimized value update.2167if (!ModifiedRegUnits.available(BaseReg) ||2168!UsedRegUnits.available(BaseReg) ||2169(BaseRegSP && MBBI->mayLoadOrStore()))2170return E;2171}2172return E;2173}21742175MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(2176MachineBasicBlock::iterator I, unsigned Limit) {2177MachineBasicBlock::iterator B = I->getParent()->begin();2178MachineBasicBlock::iterator E = I->getParent()->end();2179MachineInstr &MemMI = *I;2180MachineBasicBlock::iterator MBBI = I;2181MachineFunction &MF = *MemMI.getMF();21822183Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();2184int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();21852186// If the load/store is the first instruction in the block, there's obviously2187// not any matching update. Ditto if the memory offset isn't zero.2188if (MBBI == B || Offset != 0)2189return E;2190// If the base register overlaps a destination register, we can't2191// merge the update.2192if (!isTagStore(MemMI)) {2193bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);2194for (unsigned i = 0, e = IsPairedInsn ? 
2 : 1; i != e; ++i) {2195Register DestReg = getLdStRegOp(MemMI, i).getReg();2196if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))2197return E;2198}2199}22002201const bool BaseRegSP = BaseReg == AArch64::SP;2202if (BaseRegSP && needsWinCFI(I->getMF())) {2203// FIXME: For now, we always block the optimization over SP in windows2204// targets as it requires to adjust the unwind/debug info, messing up2205// the unwind info can actually cause a miscompile.2206return E;2207}22082209const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();2210unsigned RedZoneSize =2211Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());22122213// Track which register units have been modified and used between the first2214// insn (inclusive) and the second insn.2215ModifiedRegUnits.clear();2216UsedRegUnits.clear();2217unsigned Count = 0;2218bool MemAcessBeforeSPPreInc = false;2219do {2220MBBI = prev_nodbg(MBBI, B);2221MachineInstr &MI = *MBBI;22222223// Don't count transient instructions towards the search limit since there2224// may be different numbers of them if e.g. 
debug information is present.2225if (!MI.isTransient())2226++Count;22272228// If we found a match, return it.2229if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {2230// Check that the update value is within our red zone limit (which may be2231// zero).2232if (MemAcessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)2233return E;2234return MBBI;2235}22362237// Update the status of what the instruction clobbered and used.2238LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);22392240// Otherwise, if the base register is used or modified, we have no match, so2241// return early.2242if (!ModifiedRegUnits.available(BaseReg) ||2243!UsedRegUnits.available(BaseReg))2244return E;2245// Keep track if we have a memory access before an SP pre-increment, in this2246// case we need to validate later that the update amount respects the red2247// zone.2248if (BaseRegSP && MBBI->mayLoadOrStore())2249MemAcessBeforeSPPreInc = true;2250} while (MBBI != B && Count < Limit);2251return E;2252}22532254bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(2255MachineBasicBlock::iterator &MBBI) {2256MachineInstr &MI = *MBBI;2257// If this is a volatile load, don't mess with it.2258if (MI.hasOrderedMemoryRef())2259return false;22602261if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))2262return false;22632264// Make sure this is a reg+imm.2265// FIXME: It is possible to extend it to handle reg+reg cases.2266if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())2267return false;22682269// Look backward up to LdStLimit instructions.2270MachineBasicBlock::iterator StoreI;2271if (findMatchingStore(MBBI, LdStLimit, StoreI)) {2272++NumLoadsFromStoresPromoted;2273// Promote the load. 
Keeping the iterator straight is a2274// pain, so we let the merge routine tell us what the next instruction2275// is after it's done mucking about.2276MBBI = promoteLoadFromStore(MBBI, StoreI);2277return true;2278}2279return false;2280}22812282// Merge adjacent zero stores into a wider store.2283bool AArch64LoadStoreOpt::tryToMergeZeroStInst(2284MachineBasicBlock::iterator &MBBI) {2285assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");2286MachineInstr &MI = *MBBI;2287MachineBasicBlock::iterator E = MI.getParent()->end();22882289if (!TII->isCandidateToMergeOrPair(MI))2290return false;22912292// Look ahead up to LdStLimit instructions for a mergable instruction.2293LdStPairFlags Flags;2294MachineBasicBlock::iterator MergeMI =2295findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);2296if (MergeMI != E) {2297++NumZeroStoresPromoted;22982299// Keeping the iterator straight is a pain, so we let the merge routine tell2300// us what the next instruction is after it's done mucking about.2301MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);2302return true;2303}2304return false;2305}23062307// Find loads and stores that can be merged into a single load or store pair2308// instruction.2309bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {2310MachineInstr &MI = *MBBI;2311MachineBasicBlock::iterator E = MI.getParent()->end();23122313if (!TII->isCandidateToMergeOrPair(MI))2314return false;23152316// If disable-ldp feature is opted, do not emit ldp.2317if (MI.mayLoad() && Subtarget->hasDisableLdp())2318return false;23192320// If disable-stp feature is opted, do not emit stp.2321if (MI.mayStore() && Subtarget->hasDisableStp())2322return false;23232324// Early exit if the offset is not possible to match. 
(6 bits of positive2325// range, plus allow an extra one in case we find a later insn that matches2326// with Offset-1)2327bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);2328int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();2329int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;2330// Allow one more for offset.2331if (Offset > 0)2332Offset -= OffsetStride;2333if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))2334return false;23352336// Look ahead up to LdStLimit instructions for a pairable instruction.2337LdStPairFlags Flags;2338MachineBasicBlock::iterator Paired =2339findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);2340if (Paired != E) {2341// Keeping the iterator straight is a pain, so we let the merge routine tell2342// us what the next instruction is after it's done mucking about.2343auto Prev = std::prev(MBBI);23442345// Fetch the memoperand of the load/store that is a candidate for2346// combination.2347MachineMemOperand *MemOp =2348MI.memoperands_empty() ? 
nullptr : MI.memoperands().front();23492350// If a load/store arrives and ldp/stp-aligned-only feature is opted, check2351// that the alignment of the source pointer is at least double the alignment2352// of the type.2353if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||2354(MI.mayStore() && Subtarget->hasStpAlignedOnly())) {2355// If there is no size/align information, cancel the transformation.2356if (!MemOp || !MemOp->getMemoryType().isValid()) {2357NumFailedAlignmentCheck++;2358return false;2359}23602361// Get the needed alignments to check them if2362// ldp-aligned-only/stp-aligned-only features are opted.2363uint64_t MemAlignment = MemOp->getAlign().value();2364uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();23652366if (MemAlignment < 2 * TypeAlignment) {2367NumFailedAlignmentCheck++;2368return false;2369}2370}23712372++NumPairCreated;2373if (TII->hasUnscaledLdStOffset(MI))2374++NumUnscaledPairCreated;23752376MBBI = mergePairedInsns(MBBI, Paired, Flags);2377// Collect liveness info for instructions between Prev and the new position2378// MBBI.2379for (auto I = std::next(Prev); I != MBBI; I++)2380updateDefinedRegisters(*I, DefinedInBB, TRI);23812382return true;2383}2384return false;2385}23862387bool AArch64LoadStoreOpt::tryToMergeLdStUpdate2388(MachineBasicBlock::iterator &MBBI) {2389MachineInstr &MI = *MBBI;2390MachineBasicBlock::iterator E = MI.getParent()->end();2391MachineBasicBlock::iterator Update;23922393// Look forward to try to form a post-index instruction. 
For example,2394// ldr x0, [x20]2395// add x20, x20, #322396// merged into:2397// ldr x0, [x20], #322398Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);2399if (Update != E) {2400// Merge the update into the ld/st.2401MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);2402return true;2403}24042405// Don't know how to handle unscaled pre/post-index versions below, so bail.2406if (TII->hasUnscaledLdStOffset(MI.getOpcode()))2407return false;24082409// Look back to try to find a pre-index instruction. For example,2410// add x0, x0, #82411// ldr x1, [x0]2412// merged into:2413// ldr x1, [x0, #8]!2414Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);2415if (Update != E) {2416// Merge the update into the ld/st.2417MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);2418return true;2419}24202421// The immediate in the load/store is scaled by the size of the memory2422// operation. The immediate in the add we're looking for,2423// however, is not, so adjust here.2424int UnscaledOffset =2425AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);24262427// Look forward to try to find a pre-index instruction. For example,2428// ldr x1, [x0, #64]2429// add x0, x0, #642430// merged into:2431// ldr x1, [x0, #64]!2432Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);2433if (Update != E) {2434// Merge the update into the ld/st.2435MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);2436return true;2437}24382439return false;2440}24412442bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,2443bool EnableNarrowZeroStOpt) {24442445bool Modified = false;2446// Four tranformations to do here:2447// 1) Find loads that directly read from stores and promote them by2448// replacing with mov instructions. 
If the store is wider than the load,2449// the load will be replaced with a bitfield extract.2450// e.g.,2451// str w1, [x0, #4]2452// ldrh w2, [x0, #6]2453// ; becomes2454// str w1, [x0, #4]2455// lsr w2, w1, #162456for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();2457MBBI != E;) {2458if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))2459Modified = true;2460else2461++MBBI;2462}2463// 2) Merge adjacent zero stores into a wider store.2464// e.g.,2465// strh wzr, [x0]2466// strh wzr, [x0, #2]2467// ; becomes2468// str wzr, [x0]2469// e.g.,2470// str wzr, [x0]2471// str wzr, [x0, #4]2472// ; becomes2473// str xzr, [x0]2474if (EnableNarrowZeroStOpt)2475for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();2476MBBI != E;) {2477if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))2478Modified = true;2479else2480++MBBI;2481}2482// 3) Find loads and stores that can be merged into a single load or store2483// pair instruction.2484// e.g.,2485// ldr x0, [x2]2486// ldr x1, [x2, #8]2487// ; becomes2488// ldp x0, x1, [x2]24892490if (MBB.getParent()->getRegInfo().tracksLiveness()) {2491DefinedInBB.clear();2492DefinedInBB.addLiveIns(MBB);2493}24942495for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();2496MBBI != E;) {2497// Track currently live registers up to this point, to help with2498// searching for a rename register on demand.2499updateDefinedRegisters(*MBBI, DefinedInBB, TRI);2500if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))2501Modified = true;2502else2503++MBBI;2504}2505// 4) Find base register updates that can be merged into the load or store2506// as a base-reg writeback.2507// e.g.,2508// ldr x0, [x2]2509// add x2, x2, #42510// ; becomes2511// ldr x0, [x2], #42512for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();2513MBBI != E;) {2514if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))2515Modified = true;2516else2517++MBBI;2518}25192520return 
Modified;2521}25222523bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {2524if (skipFunction(Fn.getFunction()))2525return false;25262527Subtarget = &Fn.getSubtarget<AArch64Subtarget>();2528TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());2529TRI = Subtarget->getRegisterInfo();2530AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();25312532// Resize the modified and used register unit trackers. We do this once2533// per function and then clear the register units each time we optimize a load2534// or store.2535ModifiedRegUnits.init(*TRI);2536UsedRegUnits.init(*TRI);2537DefinedInBB.init(*TRI);25382539bool Modified = false;2540bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();2541for (auto &MBB : Fn) {2542auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);2543Modified |= M;2544}25452546return Modified;2547}25482549// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and2550// stores near one another? Note: The pre-RA instruction scheduler already has2551// hooks to try and schedule pairable loads/stores together to improve pairing2552// opportunities. Thus, pre-RA pairing pass may not be worth the effort.25532554// FIXME: When pairing store instructions it's very possible for this pass to2555// hoist a store with a KILL marker above another use (without a KILL marker).2556// The resulting IR is invalid, but nothing uses the KILL markers after this2557// pass, so it's never caused a problem in practice.25582559/// createAArch64LoadStoreOptimizationPass - returns an instance of the2560/// load / store optimization pass.2561FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {2562return new AArch64LoadStoreOpt();2563}256425652566