Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data
// directly from the store to the load. This "store forwarding" saves cycles
// by enabling the load to obtain the data directly instead of accessing it
// from cache or memory.
// A "store forward block" occurs in cases where a store cannot be forwarded
// to the load. The most typical case of a store forward block on the Intel
// Core microarchitecture is that a small store cannot be forwarded to a
// large load. The estimated penalty for a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward
// block" is created by the compiler when lowering memcpy calls to a
// sequence of a load and a store.
//
// The pass currently only handles cases where memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading
// to XMM/YMM then inserting the partial store before storing back from
// XMM/YMM to memory, but this would result in a more conservative
// optimization since it requires us to prove that all memory accesses
// between the blocking store and the load must alias/don't alias before we
// can move the store, whereas the transformation done here is correct
// regardless of other memory accesses.
//===----------------------------------------------------------------------===//
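//
// Illustrative example (editorial addition; the assembly below is a
// hypothetical sketch, not taken from this file). A lowered memcpy such as
//
//   vmovups (%rsp), %xmm0        # 16-byte load of bytes [0, 16)
//   vmovups %xmm0, (%rdi)        # 16-byte store
//
// preceded, within the inspection window, by a narrower store into the same
// region, e.g.
//
//   movl %eax, 4(%rsp)           # 4-byte store of bytes [4, 8)
//
// cannot be store-forwarded. Given the approach described above, the pass
// would rewrite the 16-byte copy as a 4-byte, a 4-byte and an 8-byte copy,
// so each piece is either fully forwarded from the narrow store or does not
// overlap it.
//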

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

#define DEBUG_TYPE "x86-avoid-SFB"

static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);

namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Returns pairs of a load followed by a store to memory which look
  /// like a memcpy.
  void findPotentiallylBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size to smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);

  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                 int64_t LoadDisp, MachineInstr *StoreInst,
                 unsigned NStoreOpcode, int64_t StoreDisp, unsigned Size,
                 int64_t LMMOffset, int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace

char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
                      "X86 Avoid Store Forwarding Blocks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
                    "X86 Avoid Store Forwarding Blocks", false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}

static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}

static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}

static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}

static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}

static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}

static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}

static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only a base register and an immediate
// displacement, or a frame index and an immediate displacement.
// TODO: Consider expanding to other addressing modes in the future.
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}
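// Illustrative note (editorial addition): with the checks above, an access
// such as "movl %eax, 16(%rsp)" (base register plus immediate displacement,
// scale 1, no index, no segment register) is considered relevant, while
// "movl %eax, 16(%rsp,%rcx,4)" is rejected because it uses an index register
// and a scale other than 1.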

// Collect potentially blocking stores.
// Limit the number of instructions backwards we want to inspect
// since the effect of the store block won't be visible if the store
// and load instructions have enough instructions in between to
// keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : MBB->predecessors()) {
      int PredCount = 0;
      for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}

void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the LoadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}

void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr,
                StDisp, MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr,
                StDisp, MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr,
                StDisp, MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}
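// Illustrative example (editorial addition): for a YMM-based copy with a
// remaining Size of 20 bytes, the greedy loop above emits one 16-byte XMM
// copy followed by one 4-byte MOV32 copy; a remaining Size of 7 bytes is
// split into 4-, 2- and 1-byte copies.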

static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}

bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;

  return !AA->isNoAlias(
      MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
      MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
}

void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (MachineOperand &StoreMO :
           llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) &&
            MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}

unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                                     *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}

void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;

  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy for the range up to the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              LMMOffset);
}
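// Worked example (editorial addition, hypothetical values): for a 32-byte
// YMM copy loaded from displacement 0 with a single 4-byte blocking store
// at displacement 20, the loop above first copies bytes [0, 20) (Size1),
// then the blocked bytes [20, 24) (Size2), and the final Size3 copy covers
// the remaining bytes [24, 32).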

static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}

static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}

// Keep track of all stores blocking a load.
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;

  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}

// Remove blocking stores contained in each other.
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;

  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}
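// Illustrative example (editorial addition): given blocking stores
// {disp 0, size 16} and {disp 4, size 4}, the second entry ends within the
// first, so the containing {0, 16} entry is popped off the stack and only
// {4, 4} is kept for the splitting done in breakBlockedCopies().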

bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load then a store to XMM/YMM which look like a memcpy.
  findPotentiallylBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration; if the blocking
      // store is in a rarely reached block, breaking the memcpy could lose
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block, break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // a store block would now be copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}