// Path: blob/main/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
// 35269 views
//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions8/// that may inhibit the HW prefetching. This is done in two steps. Before9/// ISel, we mark strided loads (i.e. those that will likely benefit from10/// prefetching) with metadata. Then, after opcodes have been finalized, we11/// insert MOVs and re-write loads to prevent unintentional tag collisions.12// ===---------------------------------------------------------------------===//1314#include "AArch64.h"15#include "AArch64InstrInfo.h"16#include "AArch64Subtarget.h"17#include "AArch64TargetMachine.h"18#include "llvm/ADT/DenseMap.h"19#include "llvm/ADT/DepthFirstIterator.h"20#include "llvm/ADT/SmallVector.h"21#include "llvm/ADT/Statistic.h"22#include "llvm/Analysis/LoopInfo.h"23#include "llvm/Analysis/ScalarEvolution.h"24#include "llvm/Analysis/ScalarEvolutionExpressions.h"25#include "llvm/CodeGen/LiveRegUnits.h"26#include "llvm/CodeGen/MachineBasicBlock.h"27#include "llvm/CodeGen/MachineFunction.h"28#include "llvm/CodeGen/MachineFunctionPass.h"29#include "llvm/CodeGen/MachineInstr.h"30#include "llvm/CodeGen/MachineInstrBuilder.h"31#include "llvm/CodeGen/MachineLoopInfo.h"32#include "llvm/CodeGen/MachineOperand.h"33#include "llvm/CodeGen/MachineRegisterInfo.h"34#include "llvm/CodeGen/TargetPassConfig.h"35#include "llvm/CodeGen/TargetRegisterInfo.h"36#include "llvm/IR/DebugLoc.h"37#include "llvm/IR/Dominators.h"38#include "llvm/IR/Function.h"39#include "llvm/IR/Instruction.h"40#include "llvm/IR/Instructions.h"41#include "llvm/IR/Metadata.h"42#include "llvm/InitializePasses.h"43#include "llvm/Pass.h"44#include 
"llvm/Support/Casting.h"45#include "llvm/Support/Debug.h"46#include "llvm/Support/DebugCounter.h"47#include "llvm/Support/raw_ostream.h"48#include <iterator>49#include <utility>5051using namespace llvm;5253#define DEBUG_TYPE "aarch64-falkor-hwpf-fix"5455STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");56STATISTIC(NumCollisionsAvoided,57"Number of HW prefetch tag collisions avoided");58STATISTIC(NumCollisionsNotAvoided,59"Number of HW prefetch tag collisions not avoided due to lack of registers");60DEBUG_COUNTER(FixCounter, "falkor-hwpf",61"Controls which tag collisions are avoided");6263namespace {6465class FalkorMarkStridedAccesses {66public:67FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)68: LI(LI), SE(SE) {}6970bool run();7172private:73bool runOnLoop(Loop &L);7475LoopInfo &LI;76ScalarEvolution &SE;77};7879class FalkorMarkStridedAccessesLegacy : public FunctionPass {80public:81static char ID; // Pass ID, replacement for typeid8283FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {84initializeFalkorMarkStridedAccessesLegacyPass(85*PassRegistry::getPassRegistry());86}8788void getAnalysisUsage(AnalysisUsage &AU) const override {89AU.addRequired<TargetPassConfig>();90AU.addPreserved<DominatorTreeWrapperPass>();91AU.addRequired<LoopInfoWrapperPass>();92AU.addPreserved<LoopInfoWrapperPass>();93AU.addRequired<ScalarEvolutionWrapperPass>();94AU.addPreserved<ScalarEvolutionWrapperPass>();95}9697bool runOnFunction(Function &F) override;98};99100} // end anonymous namespace101102char FalkorMarkStridedAccessesLegacy::ID = 0;103104INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,105"Falkor HW Prefetch Fix", false, false)106INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)107INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)108INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)109INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,110"Falkor HW Prefetch Fix", false, false)111112FunctionPass 
*llvm::createFalkorMarkStridedAccessesPass() {113return new FalkorMarkStridedAccessesLegacy();114}115116bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {117TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();118const AArch64Subtarget *ST =119TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);120if (ST->getProcFamily() != AArch64Subtarget::Falkor)121return false;122123if (skipFunction(F))124return false;125126LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();127ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();128129FalkorMarkStridedAccesses LDP(LI, SE);130return LDP.run();131}132133bool FalkorMarkStridedAccesses::run() {134bool MadeChange = false;135136for (Loop *L : LI)137for (Loop *LIt : depth_first(L))138MadeChange |= runOnLoop(*LIt);139140return MadeChange;141}142143bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {144// Only mark strided loads in the inner-most loop145if (!L.isInnermost())146return false;147148bool MadeChange = false;149150for (BasicBlock *BB : L.blocks()) {151for (Instruction &I : *BB) {152LoadInst *LoadI = dyn_cast<LoadInst>(&I);153if (!LoadI)154continue;155156Value *PtrValue = LoadI->getPointerOperand();157if (L.isLoopInvariant(PtrValue))158continue;159160const SCEV *LSCEV = SE.getSCEV(PtrValue);161const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);162if (!LSCEVAddRec || !LSCEVAddRec->isAffine())163continue;164165LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,166MDNode::get(LoadI->getContext(), {}));167++NumStridedLoadsMarked;168LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");169MadeChange = true;170}171}172173return MadeChange;174}175176namespace {177178class FalkorHWPFFix : public MachineFunctionPass {179public:180static char ID;181182FalkorHWPFFix() : MachineFunctionPass(ID) {183initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());184}185186bool runOnMachineFunction(MachineFunction &Fn) override;187188void getAnalysisUsage(AnalysisUsage 
&AU) const override {189AU.setPreservesCFG();190AU.addRequired<MachineLoopInfoWrapperPass>();191MachineFunctionPass::getAnalysisUsage(AU);192}193194MachineFunctionProperties getRequiredProperties() const override {195return MachineFunctionProperties().set(196MachineFunctionProperties::Property::NoVRegs);197}198199private:200void runOnLoop(MachineLoop &L, MachineFunction &Fn);201202const AArch64InstrInfo *TII;203const TargetRegisterInfo *TRI;204DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;205bool Modified;206};207208/// Bits from load opcodes used to compute HW prefetcher instruction tags.209struct LoadInfo {210LoadInfo() = default;211212Register DestReg;213Register BaseReg;214int BaseRegIdx = -1;215const MachineOperand *OffsetOpnd = nullptr;216bool IsPrePost = false;217};218219} // end anonymous namespace220221char FalkorHWPFFix::ID = 0;222223INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",224"Falkor HW Prefetch Fix Late Phase", false, false)225INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)226INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",227"Falkor HW Prefetch Fix Late Phase", false, false)228229static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {230return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);231}232233static std::optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {234int DestRegIdx;235int BaseRegIdx;236int OffsetIdx;237bool IsPrePost;238239switch (MI.getOpcode()) {240default:241return std::nullopt;242243case AArch64::LD1i64:244case AArch64::LD2i64:245DestRegIdx = 0;246BaseRegIdx = 3;247OffsetIdx = -1;248IsPrePost = false;249break;250251case AArch64::LD1i8:252case AArch64::LD1i16:253case AArch64::LD1i32:254case AArch64::LD2i8:255case AArch64::LD2i16:256case AArch64::LD2i32:257case AArch64::LD3i8:258case AArch64::LD3i16:259case AArch64::LD3i32:260case AArch64::LD3i64:261case AArch64::LD4i8:262case AArch64::LD4i16:263case AArch64::LD4i32:264case 
AArch64::LD4i64:265DestRegIdx = -1;266BaseRegIdx = 3;267OffsetIdx = -1;268IsPrePost = false;269break;270271case AArch64::LD1Onev1d:272case AArch64::LD1Onev2s:273case AArch64::LD1Onev4h:274case AArch64::LD1Onev8b:275case AArch64::LD1Onev2d:276case AArch64::LD1Onev4s:277case AArch64::LD1Onev8h:278case AArch64::LD1Onev16b:279case AArch64::LD1Rv1d:280case AArch64::LD1Rv2s:281case AArch64::LD1Rv4h:282case AArch64::LD1Rv8b:283case AArch64::LD1Rv2d:284case AArch64::LD1Rv4s:285case AArch64::LD1Rv8h:286case AArch64::LD1Rv16b:287DestRegIdx = 0;288BaseRegIdx = 1;289OffsetIdx = -1;290IsPrePost = false;291break;292293case AArch64::LD1Twov1d:294case AArch64::LD1Twov2s:295case AArch64::LD1Twov4h:296case AArch64::LD1Twov8b:297case AArch64::LD1Twov2d:298case AArch64::LD1Twov4s:299case AArch64::LD1Twov8h:300case AArch64::LD1Twov16b:301case AArch64::LD1Threev1d:302case AArch64::LD1Threev2s:303case AArch64::LD1Threev4h:304case AArch64::LD1Threev8b:305case AArch64::LD1Threev2d:306case AArch64::LD1Threev4s:307case AArch64::LD1Threev8h:308case AArch64::LD1Threev16b:309case AArch64::LD1Fourv1d:310case AArch64::LD1Fourv2s:311case AArch64::LD1Fourv4h:312case AArch64::LD1Fourv8b:313case AArch64::LD1Fourv2d:314case AArch64::LD1Fourv4s:315case AArch64::LD1Fourv8h:316case AArch64::LD1Fourv16b:317case AArch64::LD2Twov2s:318case AArch64::LD2Twov4s:319case AArch64::LD2Twov8b:320case AArch64::LD2Twov2d:321case AArch64::LD2Twov4h:322case AArch64::LD2Twov8h:323case AArch64::LD2Twov16b:324case AArch64::LD2Rv1d:325case AArch64::LD2Rv2s:326case AArch64::LD2Rv4s:327case AArch64::LD2Rv8b:328case AArch64::LD2Rv2d:329case AArch64::LD2Rv4h:330case AArch64::LD2Rv8h:331case AArch64::LD2Rv16b:332case AArch64::LD3Threev2s:333case AArch64::LD3Threev4h:334case AArch64::LD3Threev8b:335case AArch64::LD3Threev2d:336case AArch64::LD3Threev4s:337case AArch64::LD3Threev8h:338case AArch64::LD3Threev16b:339case AArch64::LD3Rv1d:340case AArch64::LD3Rv2s:341case AArch64::LD3Rv4h:342case AArch64::LD3Rv8b:343case 
AArch64::LD3Rv2d:344case AArch64::LD3Rv4s:345case AArch64::LD3Rv8h:346case AArch64::LD3Rv16b:347case AArch64::LD4Fourv2s:348case AArch64::LD4Fourv4h:349case AArch64::LD4Fourv8b:350case AArch64::LD4Fourv2d:351case AArch64::LD4Fourv4s:352case AArch64::LD4Fourv8h:353case AArch64::LD4Fourv16b:354case AArch64::LD4Rv1d:355case AArch64::LD4Rv2s:356case AArch64::LD4Rv4h:357case AArch64::LD4Rv8b:358case AArch64::LD4Rv2d:359case AArch64::LD4Rv4s:360case AArch64::LD4Rv8h:361case AArch64::LD4Rv16b:362DestRegIdx = -1;363BaseRegIdx = 1;364OffsetIdx = -1;365IsPrePost = false;366break;367368case AArch64::LD1i64_POST:369case AArch64::LD2i64_POST:370DestRegIdx = 1;371BaseRegIdx = 4;372OffsetIdx = 5;373IsPrePost = true;374break;375376case AArch64::LD1i8_POST:377case AArch64::LD1i16_POST:378case AArch64::LD1i32_POST:379case AArch64::LD2i8_POST:380case AArch64::LD2i16_POST:381case AArch64::LD2i32_POST:382case AArch64::LD3i8_POST:383case AArch64::LD3i16_POST:384case AArch64::LD3i32_POST:385case AArch64::LD3i64_POST:386case AArch64::LD4i8_POST:387case AArch64::LD4i16_POST:388case AArch64::LD4i32_POST:389case AArch64::LD4i64_POST:390DestRegIdx = -1;391BaseRegIdx = 4;392OffsetIdx = 5;393IsPrePost = true;394break;395396case AArch64::LD1Onev1d_POST:397case AArch64::LD1Onev2s_POST:398case AArch64::LD1Onev4h_POST:399case AArch64::LD1Onev8b_POST:400case AArch64::LD1Onev2d_POST:401case AArch64::LD1Onev4s_POST:402case AArch64::LD1Onev8h_POST:403case AArch64::LD1Onev16b_POST:404case AArch64::LD1Rv1d_POST:405case AArch64::LD1Rv2s_POST:406case AArch64::LD1Rv4h_POST:407case AArch64::LD1Rv8b_POST:408case AArch64::LD1Rv2d_POST:409case AArch64::LD1Rv4s_POST:410case AArch64::LD1Rv8h_POST:411case AArch64::LD1Rv16b_POST:412DestRegIdx = 1;413BaseRegIdx = 2;414OffsetIdx = 3;415IsPrePost = true;416break;417418case AArch64::LD1Twov1d_POST:419case AArch64::LD1Twov2s_POST:420case AArch64::LD1Twov4h_POST:421case AArch64::LD1Twov8b_POST:422case AArch64::LD1Twov2d_POST:423case AArch64::LD1Twov4s_POST:424case 
AArch64::LD1Twov8h_POST:425case AArch64::LD1Twov16b_POST:426case AArch64::LD1Threev1d_POST:427case AArch64::LD1Threev2s_POST:428case AArch64::LD1Threev4h_POST:429case AArch64::LD1Threev8b_POST:430case AArch64::LD1Threev2d_POST:431case AArch64::LD1Threev4s_POST:432case AArch64::LD1Threev8h_POST:433case AArch64::LD1Threev16b_POST:434case AArch64::LD1Fourv1d_POST:435case AArch64::LD1Fourv2s_POST:436case AArch64::LD1Fourv4h_POST:437case AArch64::LD1Fourv8b_POST:438case AArch64::LD1Fourv2d_POST:439case AArch64::LD1Fourv4s_POST:440case AArch64::LD1Fourv8h_POST:441case AArch64::LD1Fourv16b_POST:442case AArch64::LD2Twov2s_POST:443case AArch64::LD2Twov4s_POST:444case AArch64::LD2Twov8b_POST:445case AArch64::LD2Twov2d_POST:446case AArch64::LD2Twov4h_POST:447case AArch64::LD2Twov8h_POST:448case AArch64::LD2Twov16b_POST:449case AArch64::LD2Rv1d_POST:450case AArch64::LD2Rv2s_POST:451case AArch64::LD2Rv4s_POST:452case AArch64::LD2Rv8b_POST:453case AArch64::LD2Rv2d_POST:454case AArch64::LD2Rv4h_POST:455case AArch64::LD2Rv8h_POST:456case AArch64::LD2Rv16b_POST:457case AArch64::LD3Threev2s_POST:458case AArch64::LD3Threev4h_POST:459case AArch64::LD3Threev8b_POST:460case AArch64::LD3Threev2d_POST:461case AArch64::LD3Threev4s_POST:462case AArch64::LD3Threev8h_POST:463case AArch64::LD3Threev16b_POST:464case AArch64::LD3Rv1d_POST:465case AArch64::LD3Rv2s_POST:466case AArch64::LD3Rv4h_POST:467case AArch64::LD3Rv8b_POST:468case AArch64::LD3Rv2d_POST:469case AArch64::LD3Rv4s_POST:470case AArch64::LD3Rv8h_POST:471case AArch64::LD3Rv16b_POST:472case AArch64::LD4Fourv2s_POST:473case AArch64::LD4Fourv4h_POST:474case AArch64::LD4Fourv8b_POST:475case AArch64::LD4Fourv2d_POST:476case AArch64::LD4Fourv4s_POST:477case AArch64::LD4Fourv8h_POST:478case AArch64::LD4Fourv16b_POST:479case AArch64::LD4Rv1d_POST:480case AArch64::LD4Rv2s_POST:481case AArch64::LD4Rv4h_POST:482case AArch64::LD4Rv8b_POST:483case AArch64::LD4Rv2d_POST:484case AArch64::LD4Rv4s_POST:485case AArch64::LD4Rv8h_POST:486case 
AArch64::LD4Rv16b_POST:487DestRegIdx = -1;488BaseRegIdx = 2;489OffsetIdx = 3;490IsPrePost = true;491break;492493case AArch64::LDRBBroW:494case AArch64::LDRBBroX:495case AArch64::LDRBBui:496case AArch64::LDRBroW:497case AArch64::LDRBroX:498case AArch64::LDRBui:499case AArch64::LDRDl:500case AArch64::LDRDroW:501case AArch64::LDRDroX:502case AArch64::LDRDui:503case AArch64::LDRHHroW:504case AArch64::LDRHHroX:505case AArch64::LDRHHui:506case AArch64::LDRHroW:507case AArch64::LDRHroX:508case AArch64::LDRHui:509case AArch64::LDRQl:510case AArch64::LDRQroW:511case AArch64::LDRQroX:512case AArch64::LDRQui:513case AArch64::LDRSBWroW:514case AArch64::LDRSBWroX:515case AArch64::LDRSBWui:516case AArch64::LDRSBXroW:517case AArch64::LDRSBXroX:518case AArch64::LDRSBXui:519case AArch64::LDRSHWroW:520case AArch64::LDRSHWroX:521case AArch64::LDRSHWui:522case AArch64::LDRSHXroW:523case AArch64::LDRSHXroX:524case AArch64::LDRSHXui:525case AArch64::LDRSWl:526case AArch64::LDRSWroW:527case AArch64::LDRSWroX:528case AArch64::LDRSWui:529case AArch64::LDRSl:530case AArch64::LDRSroW:531case AArch64::LDRSroX:532case AArch64::LDRSui:533case AArch64::LDRWl:534case AArch64::LDRWroW:535case AArch64::LDRWroX:536case AArch64::LDRWui:537case AArch64::LDRXl:538case AArch64::LDRXroW:539case AArch64::LDRXroX:540case AArch64::LDRXui:541case AArch64::LDURBBi:542case AArch64::LDURBi:543case AArch64::LDURDi:544case AArch64::LDURHHi:545case AArch64::LDURHi:546case AArch64::LDURQi:547case AArch64::LDURSBWi:548case AArch64::LDURSBXi:549case AArch64::LDURSHWi:550case AArch64::LDURSHXi:551case AArch64::LDURSWi:552case AArch64::LDURSi:553case AArch64::LDURWi:554case AArch64::LDURXi:555DestRegIdx = 0;556BaseRegIdx = 1;557OffsetIdx = 2;558IsPrePost = false;559break;560561case AArch64::LDRBBpost:562case AArch64::LDRBBpre:563case AArch64::LDRBpost:564case AArch64::LDRBpre:565case AArch64::LDRDpost:566case AArch64::LDRDpre:567case AArch64::LDRHHpost:568case AArch64::LDRHHpre:569case AArch64::LDRHpost:570case 
AArch64::LDRHpre:571case AArch64::LDRQpost:572case AArch64::LDRQpre:573case AArch64::LDRSBWpost:574case AArch64::LDRSBWpre:575case AArch64::LDRSBXpost:576case AArch64::LDRSBXpre:577case AArch64::LDRSHWpost:578case AArch64::LDRSHWpre:579case AArch64::LDRSHXpost:580case AArch64::LDRSHXpre:581case AArch64::LDRSWpost:582case AArch64::LDRSWpre:583case AArch64::LDRSpost:584case AArch64::LDRSpre:585case AArch64::LDRWpost:586case AArch64::LDRWpre:587case AArch64::LDRXpost:588case AArch64::LDRXpre:589DestRegIdx = 1;590BaseRegIdx = 2;591OffsetIdx = 3;592IsPrePost = true;593break;594595case AArch64::LDNPDi:596case AArch64::LDNPQi:597case AArch64::LDNPSi:598case AArch64::LDPQi:599case AArch64::LDPDi:600case AArch64::LDPSi:601DestRegIdx = -1;602BaseRegIdx = 2;603OffsetIdx = 3;604IsPrePost = false;605break;606607case AArch64::LDPSWi:608case AArch64::LDPWi:609case AArch64::LDPXi:610DestRegIdx = 0;611BaseRegIdx = 2;612OffsetIdx = 3;613IsPrePost = false;614break;615616case AArch64::LDPQpost:617case AArch64::LDPQpre:618case AArch64::LDPDpost:619case AArch64::LDPDpre:620case AArch64::LDPSpost:621case AArch64::LDPSpre:622DestRegIdx = -1;623BaseRegIdx = 3;624OffsetIdx = 4;625IsPrePost = true;626break;627628case AArch64::LDPSWpost:629case AArch64::LDPSWpre:630case AArch64::LDPWpost:631case AArch64::LDPWpre:632case AArch64::LDPXpost:633case AArch64::LDPXpre:634DestRegIdx = 1;635BaseRegIdx = 3;636OffsetIdx = 4;637IsPrePost = true;638break;639}640641// Loads from the stack pointer don't get prefetched.642Register BaseReg = MI.getOperand(BaseRegIdx).getReg();643if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)644return std::nullopt;645646LoadInfo LI;647LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();648LI.BaseReg = BaseReg;649LI.BaseRegIdx = BaseRegIdx;650LI.OffsetOpnd = OffsetIdx == -1 ? 
nullptr : &MI.getOperand(OffsetIdx);651LI.IsPrePost = IsPrePost;652return LI;653}654655static std::optional<unsigned> getTag(const TargetRegisterInfo *TRI,656const MachineInstr &MI,657const LoadInfo &LI) {658unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;659unsigned Base = TRI->getEncodingValue(LI.BaseReg);660unsigned Off;661if (LI.OffsetOpnd == nullptr)662Off = 0;663else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||664LI.OffsetOpnd->isCPI())665return std::nullopt;666else if (LI.OffsetOpnd->isReg())667Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());668else669Off = LI.OffsetOpnd->getImm() >> 2;670671return makeTag(Dest, Base, Off);672}673674void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {675// Build the initial tag map for the whole loop.676TagMap.clear();677for (MachineBasicBlock *MBB : L.getBlocks())678for (MachineInstr &MI : *MBB) {679std::optional<LoadInfo> LInfo = getLoadInfo(MI);680if (!LInfo)681continue;682std::optional<unsigned> Tag = getTag(TRI, MI, *LInfo);683if (!Tag)684continue;685TagMap[*Tag].push_back(&MI);686}687688bool AnyCollisions = false;689for (auto &P : TagMap) {690auto Size = P.second.size();691if (Size > 1) {692for (auto *MI : P.second) {693if (TII->isStridedAccess(*MI)) {694AnyCollisions = true;695break;696}697}698}699if (AnyCollisions)700break;701}702// Nothing to fix.703if (!AnyCollisions)704return;705706MachineRegisterInfo &MRI = Fn.getRegInfo();707708// Go through all the basic blocks in the current loop and fix any streaming709// loads to avoid collisions with any other loads.710LiveRegUnits LR(*TRI);711for (MachineBasicBlock *MBB : L.getBlocks()) {712LR.clear();713LR.addLiveOuts(*MBB);714for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {715MachineInstr &MI = *I;716if (!TII->isStridedAccess(MI))717continue;718719std::optional<LoadInfo> OptLdI = getLoadInfo(MI);720if (!OptLdI)721continue;722LoadInfo LdI = *OptLdI;723std::optional<unsigned> 
OptOldTag = getTag(TRI, MI, LdI);724if (!OptOldTag)725continue;726auto &OldCollisions = TagMap[*OptOldTag];727if (OldCollisions.size() <= 1)728continue;729730bool Fixed = false;731LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);732733if (!DebugCounter::shouldExecute(FixCounter)) {734LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);735continue;736}737738// Add the non-base registers of MI as live so we don't use them as739// scratch registers.740for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {741if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))742continue;743MachineOperand &MO = MI.getOperand(OpI);744if (MO.isReg() && MO.readsReg())745LR.addReg(MO.getReg());746}747748for (unsigned ScratchReg : AArch64::GPR64RegClass) {749if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))750continue;751752LoadInfo NewLdI(LdI);753NewLdI.BaseReg = ScratchReg;754unsigned NewTag = *getTag(TRI, MI, NewLdI);755// Scratch reg tag would collide too, so don't use it.756if (TagMap.count(NewTag))757continue;758759LLVM_DEBUG(dbgs() << "Changing base reg to: "760<< printReg(ScratchReg, TRI) << '\n');761762// Rewrite:763// Xd = LOAD Xb, off764// to:765// Xc = MOV Xb766// Xd = LOAD Xc, off767DebugLoc DL = MI.getDebugLoc();768BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)769.addReg(AArch64::XZR)770.addReg(LdI.BaseReg)771.addImm(0);772MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);773BaseOpnd.setReg(ScratchReg);774775// If the load does a pre/post increment, then insert a MOV after as776// well to update the real base register.777if (LdI.IsPrePost) {778LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "779<< printReg(ScratchReg, TRI) << '\n');780MI.getOperand(0).setReg(781ScratchReg); // Change tied operand pre/post update dest.782BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,783TII->get(AArch64::ORRXrs), LdI.BaseReg)784.addReg(AArch64::XZR)785.addReg(ScratchReg)786.addImm(0);787}788789for 
(int I = 0, E = OldCollisions.size(); I != E; ++I)790if (OldCollisions[I] == &MI) {791std::swap(OldCollisions[I], OldCollisions[E - 1]);792OldCollisions.pop_back();793break;794}795796// Update TagMap to reflect instruction changes to reduce the number797// of later MOVs to be inserted. This needs to be done after798// OldCollisions is updated since it may be relocated by this799// insertion.800TagMap[NewTag].push_back(&MI);801++NumCollisionsAvoided;802Fixed = true;803Modified = true;804break;805}806if (!Fixed)807++NumCollisionsNotAvoided;808}809}810}811812bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {813auto &ST = Fn.getSubtarget<AArch64Subtarget>();814if (ST.getProcFamily() != AArch64Subtarget::Falkor)815return false;816817if (skipFunction(Fn.getFunction()))818return false;819820TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());821TRI = ST.getRegisterInfo();822823MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();824825Modified = false;826827for (MachineLoop *I : LI)828for (MachineLoop *L : depth_first(I))829// Only process inner-loops830if (L->isInnermost())831runOnLoop(*L, Fn);832833return Modified;834}835836FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }837838839