Path: blob/main/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp
213799 views
//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// Merge the offset of address calculation into the offset field9// of instructions in a global address lowering sequence.10//11//===----------------------------------------------------------------------===//1213#include "LoongArch.h"14#include "LoongArchTargetMachine.h"15#include "llvm/CodeGen/MachineFunctionPass.h"16#include "llvm/CodeGen/Passes.h"17#include "llvm/MC/TargetRegistry.h"18#include "llvm/Support/Debug.h"19#include "llvm/Target/TargetOptions.h"20#include <optional>2122using namespace llvm;2324#define DEBUG_TYPE "loongarch-merge-base-offset"25#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"2627namespace {2829class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {30const LoongArchSubtarget *ST = nullptr;31MachineRegisterInfo *MRI;3233public:34static char ID;35bool runOnMachineFunction(MachineFunction &Fn) override;36bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,37MachineInstr *&Lo20, MachineInstr *&Hi12,38MachineInstr *&Last);39bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,40MachineInstr *&Lo12);4142bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,43MachineInstr *&Lo20, MachineInstr *&Hi12,44MachineInstr *&Last);45void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,46MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,47int64_t Offset);48bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,49MachineInstr *&Lo20, MachineInstr *&Hi12,50MachineInstr *&Last, MachineInstr &TailAdd,51Register GAReg);5253bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,54MachineInstr *&Lo20, MachineInstr *&Hi12,55MachineInstr *&Last);5657LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}5859MachineFunctionProperties getRequiredProperties() const override {60return MachineFunctionProperties().setIsSSA();61}6263void getAnalysisUsage(AnalysisUsage &AU) const override {64AU.setPreservesCFG();65MachineFunctionPass::getAnalysisUsage(AU);66}6768StringRef getPassName() const override {69return LoongArch_MERGE_BASE_OFFSET_NAME;70}71};72} // end anonymous namespace7374char LoongArchMergeBaseOffsetOpt::ID = 0;75INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,76LoongArch_MERGE_BASE_OFFSET_NAME, false, false)7778// Detect either of the patterns:79//80// 1. (small/medium):81// pcalau12i vreg1, %pc_hi20(s)82// addi.d vreg2, vreg1, %pc_lo12(s)83//84// 2. (large):85// pcalau12i vreg1, %pc_hi20(s)86// addi.d vreg2, $zero, %pc_lo12(s)87// lu32i.d vreg3, vreg2, %pc64_lo20(s)88// lu52i.d vreg4, vreg3, %pc64_hi12(s)89// add.d vreg5, vreg4, vreg19091// The pattern is only accepted if:92// 1) For small and medium pattern, the first instruction has only one use,93// which is the ADDI.94// 2) For large pattern, the first four instructions each have only one use,95// and the user of the fourth instruction is ADD.96// 3) The address operands have the appropriate type, reflecting the97// lowering of a global address or constant pool using the pattern.98// 4) The offset value in the Global Address or Constant Pool is 0.99bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,100MachineInstr *&Lo12,101MachineInstr *&Lo20,102MachineInstr *&Hi12,103MachineInstr *&Last) {104if (Hi20.getOpcode() != LoongArch::PCALAU12I)105return false;106107const MachineOperand &Hi20Op1 = Hi20.getOperand(1);108if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)109return false;110111auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {112return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();113};114115if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)116return false;117118Register HiDestReg = Hi20.getOperand(0).getReg();119if (!MRI->hasOneUse(HiDestReg))120return false;121122MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);123if (UseInst->getOpcode() != LoongArch::ADD_D) {124Lo12 = UseInst;125if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||126(!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))127return false;128} else {129assert(ST->is64Bit());130Last = UseInst;131132Register LastOp1Reg = Last->getOperand(1).getReg();133if (!LastOp1Reg.isVirtual())134return false;135Hi12 = MRI->getVRegDef(LastOp1Reg);136const MachineOperand &Hi12Op2 = Hi12->getOperand(2);137if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)138return false;139if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)140return false;141if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))142return false;143144Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());145const MachineOperand &Lo20Op2 = Lo20->getOperand(2);146if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)147return false;148if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)149return false;150if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))151return false;152153Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());154if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))155return false;156}157158const MachineOperand &Lo12Op2 = Lo12->getOperand(2);159assert(Hi20.getOpcode() == LoongArch::PCALAU12I);160if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||161!(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||162Lo12Op2.getOffset() != 0)163return false;164165if (Hi20Op1.isGlobal()) {166LLVM_DEBUG(dbgs() << " Found lowered global address: "167<< *Hi20Op1.getGlobal() << "\n");168} else if (Hi20Op1.isBlockAddress()) {169LLVM_DEBUG(dbgs() << " Found lowered basic address: "170<< *Hi20Op1.getBlockAddress() << "\n");171} else if (Hi20Op1.isCPI()) {172LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()173<< "\n");174}175176return true;177}178179// Detect the pattern:180//181// (small/medium):182// lu12i.w vreg1, %le_hi20_r(s)183// add.w/d vreg2, vreg1, r2, %le_add_r(s)184// addi.w/d vreg3, vreg2, %le_lo12_r(s)185186// The pattern is only accepted if:187// 1) The first instruction has only one use, which is the PseudoAddTPRel.188// The second instruction has only one use, which is the ADDI. The189// second instruction's last operand is the tp register.190// 2) The address operands have the appropriate type, reflecting the191// lowering of a thread_local global address using the pattern.192// 3) The offset value in the ThreadLocal Global Address is 0.193bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,194MachineInstr *&Add,195MachineInstr *&Lo12) {196if (Hi20.getOpcode() != LoongArch::LU12I_W)197return false;198199auto isGlobalOrCPI = [](const MachineOperand &Op) {200return Op.isGlobal() || Op.isCPI();201};202203const MachineOperand &Hi20Op1 = Hi20.getOperand(1);204if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||205!isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)206return false;207208Register HiDestReg = Hi20.getOperand(0).getReg();209if (!MRI->hasOneUse(HiDestReg))210return false;211212Add = &*MRI->use_instr_begin(HiDestReg);213if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||214(!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))215return false;216217if (Add->getOperand(2).getReg() != LoongArch::R2)218return false;219220const MachineOperand &AddOp3 = Add->getOperand(3);221if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||222!(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||223AddOp3.getOffset() != 0)224return false;225226Register AddDestReg = Add->getOperand(0).getReg();227if (!MRI->hasOneUse(AddDestReg))228return false;229230Lo12 = &*MRI->use_instr_begin(AddDestReg);231if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||232(!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))233return false;234235const MachineOperand &Lo12Op2 = Lo12->getOperand(2);236if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||237!(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||238Lo12Op2.getOffset() != 0)239return false;240241if (Hi20Op1.isGlobal()) {242LLVM_DEBUG(dbgs() << " Found lowered global address: "243<< *Hi20Op1.getGlobal() << "\n");244} else if (Hi20Op1.isCPI()) {245LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()246<< "\n");247}248249return true;250}251252// Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.253// Delete the tail instruction and update all the uses to use the254// output from Last.255void LoongArchMergeBaseOffsetOpt::foldOffset(256MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,257MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,258int64_t Offset) {259// Put the offset back in Hi and the Lo260Hi20.getOperand(1).setOffset(Offset);261Lo12.getOperand(2).setOffset(Offset);262if (Lo20 && Hi12) {263Lo20->getOperand(2).setOffset(Offset);264Hi12->getOperand(2).setOffset(Offset);265}266267// For tls-le, offset of the second PseudoAddTPRel instr should also be268// updated.269MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());270if (Hi20.getOpcode() == LoongArch::LU12I_W)271Add->getOperand(3).setOffset(Offset);272273// Delete the tail instruction.274MachineInstr *Def = Last ? Last : &Lo12;275MRI->constrainRegClass(Def->getOperand(0).getReg(),276MRI->getRegClass(Tail.getOperand(0).getReg()));277MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());278Tail.eraseFromParent();279280LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"281<< " " << Hi20;);282if (Hi20.getOpcode() == LoongArch::LU12I_W) {283LLVM_DEBUG(dbgs() << " " << *Add;);284}285LLVM_DEBUG(dbgs() << " " << Lo12;);286if (Lo20 && Hi12) {287LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);288}289}290291// Detect patterns for large offsets that are passed into an ADD instruction.292// If the pattern is found, updates the offset in Hi20, (Add), Lo12,293// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that294// produced the offset.295//296// (The instructions marked with "!" are not necessarily present)297//298// Base address lowering is of the form:299// 1) pcala:300// Hi20: pcalau12i vreg1, %pc_hi20(s)301// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)302// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !303// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !304// |305// | 2) tls-le:306// | Hi20: lu12i.w vreg1, %le_hi20_r(s)307// | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)308// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)309// |310// | The large offset can be one of the forms:311// |312// +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits:313// | OffsetHi20: lu12i.w vreg3, 4314// | OffsetLo12: ori voff, vreg3, 188 ------------------+315// | |316// +-> 2) Offset that has non zero bits in Hi20 bits only: |317// | OffsetHi20: lu12i.w voff, 128 ------------------+318// | |319// +-> 3) Offset that has non zero bits in Lo20 bits: |320// | OffsetHi20: lu12i.w vreg3, 121 ! |321// | OffsetLo12: ori voff, vreg3, 122 ! |322// | OffsetLo20: lu32i.d voff, 123 ------------------+323// +-> 4) Offset that has non zero bits in Hi12 bits: |324// OffsetHi20: lu12i.w vreg3, 121 ! |325// OffsetLo12: ori voff, vreg3, 122 ! |326// OffsetLo20: lu32i.d vreg3, 123 ! |327// OffsetHi12: lu52i.d voff, vrg3, 124 ------------------+328// |329// TailAdd: add.d vreg4, vreg2, voff <------------------+330//331bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(332MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,333MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,334Register GAReg) {335assert((TailAdd.getOpcode() == LoongArch::ADD_W ||336TailAdd.getOpcode() == LoongArch::ADD_D) &&337"Expected ADD instruction!");338Register Rs = TailAdd.getOperand(1).getReg();339Register Rt = TailAdd.getOperand(2).getReg();340Register Reg = Rs == GAReg ? Rt : Rs;341SmallVector<MachineInstr *, 4> Instrs;342int64_t Offset = 0;343int64_t Mask = -1;344345// This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:346for (int i = 0; i < 4; i++) {347// Handle Reg is R0.348if (Reg == LoongArch::R0)349break;350351// Can't fold if the register has more than one use.352if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))353return false;354355MachineInstr *Curr = MRI->getVRegDef(Reg);356if (!Curr)357break;358359switch (Curr->getOpcode()) {360default:361// Can't fold if the instruction opcode is unexpected.362return false;363case LoongArch::ORI: {364MachineOperand ImmOp = Curr->getOperand(2);365if (ImmOp.getTargetFlags() != LoongArchII::MO_None)366return false;367Offset += ImmOp.getImm();368Reg = Curr->getOperand(1).getReg();369Instrs.push_back(Curr);370break;371}372case LoongArch::LU12I_W: {373MachineOperand ImmOp = Curr->getOperand(1);374if (ImmOp.getTargetFlags() != LoongArchII::MO_None)375return false;376Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;377Reg = LoongArch::R0;378Instrs.push_back(Curr);379break;380}381case LoongArch::LU32I_D: {382MachineOperand ImmOp = Curr->getOperand(2);383if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)384return false;385Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;386Mask ^= 0x000FFFFF00000000ULL;387Reg = Curr->getOperand(1).getReg();388Instrs.push_back(Curr);389break;390}391case LoongArch::LU52I_D: {392MachineOperand ImmOp = Curr->getOperand(2);393if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)394return false;395Offset += ImmOp.getImm() << 52;396Mask ^= 0xFFF0000000000000ULL;397Reg = Curr->getOperand(1).getReg();398Instrs.push_back(Curr);399break;400}401}402}403404// Can't fold if the offset is not extracted.405if (!Offset)406return false;407408foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);409LLVM_DEBUG(dbgs() << " Offset Instrs:\n");410for (auto I : Instrs) {411LLVM_DEBUG(dbgs() << " " << *I);412I->eraseFromParent();413}414415return true;416}417418bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,419MachineInstr &Lo12,420MachineInstr *&Lo20,421MachineInstr *&Hi12,422MachineInstr *&Last) {423Register DestReg =424Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();425426// Look for arithmetic instructions we can get an offset from.427// We might be able to remove the arithmetic instructions by folding the428// offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or429// LU12I_W+PseudoAddTPRel+ADDI.430if (!MRI->hasOneUse(DestReg))431return false;432433// DestReg has only one use.434MachineInstr &Tail = *MRI->use_instr_begin(DestReg);435switch (Tail.getOpcode()) {436default:437LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"438<< Tail);439break;440case LoongArch::ADDI_W:441if (ST->is64Bit())442return false;443[[fallthrough]];444case LoongArch::ADDI_D:445case LoongArch::ADDU16I_D: {446// Offset is simply an immediate operand.447int64_t Offset = Tail.getOperand(2).getImm();448if (Tail.getOpcode() == LoongArch::ADDU16I_D)449Offset = SignExtend64<32>(Offset << 16);450451// We might have two ADDIs in a row.452Register TailDestReg = Tail.getOperand(0).getReg();453if (MRI->hasOneUse(TailDestReg)) {454MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);455if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)456return false;457if (TailTail.getOpcode() == LoongArch::ADDI_W ||458TailTail.getOpcode() == LoongArch::ADDI_D) {459Offset += TailTail.getOperand(2).getImm();460LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail);461foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);462Tail.eraseFromParent();463return true;464}465}466467LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);468foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);469return true;470}471case LoongArch::ADD_W:472if (ST->is64Bit())473return false;474[[fallthrough]];475case LoongArch::ADD_D:476// The offset is too large to fit in the immediate field of ADDI.477return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);478break;479}480481return false;482}483484// Memory access opcode mapping for transforms.485static unsigned getNewOpc(unsigned Op, bool isLarge) {486switch (Op) {487case LoongArch::LD_B:488return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;489case LoongArch::LD_H:490return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;491case LoongArch::LD_W:492case LoongArch::LDPTR_W:493return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;494case LoongArch::LD_D:495case LoongArch::LDPTR_D:496return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;497case LoongArch::LD_BU:498return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;499case LoongArch::LD_HU:500return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;501case LoongArch::LD_WU:502return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;503case LoongArch::FLD_S:504return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;505case LoongArch::FLD_D:506return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;507case LoongArch::VLD:508return isLarge ? LoongArch::VLDX : LoongArch::VLD;509case LoongArch::XVLD:510return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;511case LoongArch::VLDREPL_B:512return LoongArch::VLDREPL_B;513case LoongArch::XVLDREPL_B:514return LoongArch::XVLDREPL_B;515case LoongArch::ST_B:516return isLarge ? LoongArch::STX_B : LoongArch::ST_B;517case LoongArch::ST_H:518return isLarge ? LoongArch::STX_H : LoongArch::ST_H;519case LoongArch::ST_W:520case LoongArch::STPTR_W:521return isLarge ? LoongArch::STX_W : LoongArch::ST_W;522case LoongArch::ST_D:523case LoongArch::STPTR_D:524return isLarge ? LoongArch::STX_D : LoongArch::ST_D;525case LoongArch::FST_S:526return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;527case LoongArch::FST_D:528return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;529case LoongArch::VST:530return isLarge ? LoongArch::VSTX : LoongArch::VST;531case LoongArch::XVST:532return isLarge ? LoongArch::XVSTX : LoongArch::XVST;533default:534llvm_unreachable("Unexpected opcode for replacement");535}536}537538bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,539MachineInstr &Lo12,540MachineInstr *&Lo20,541MachineInstr *&Hi12,542MachineInstr *&Last) {543Register DestReg =544Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();545546// If all the uses are memory ops with the same offset, we can transform:547//548// 1. (small/medium):549// 1.1. pcala550// pcalau12i vreg1, %pc_hi20(s)551// addi.d vreg2, vreg1, %pc_lo12(s)552// ld.w vreg3, 8(vreg2)553//554// =>555//556// pcalau12i vreg1, %pc_hi20(s+8)557// ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)558//559// 1.2. tls-le560// lu12i.w vreg1, %le_hi20_r(s)561// add.w/d vreg2, vreg1, r2, %le_add_r(s)562// addi.w/d vreg3, vreg2, %le_lo12_r(s)563// ld.w vreg4, 8(vreg3)564//565// =>566//567// lu12i.w vreg1, %le_hi20_r(s+8)568// add.w/d vreg2, vreg1, r2, %le_add_r(s+8)569// ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)570//571// 2. (large):572// pcalau12i vreg1, %pc_hi20(s)573// addi.d vreg2, $zero, %pc_lo12(s)574// lu32i.d vreg3, vreg2, %pc64_lo20(s)575// lu52i.d vreg4, vreg3, %pc64_hi12(s)576// add.d vreg5, vreg4, vreg1577// ld.w vreg6, 8(vreg5)578//579// =>580//581// pcalau12i vreg1, %pc_hi20(s+8)582// addi.d vreg2, $zero, %pc_lo12(s+8)583// lu32i.d vreg3, vreg2, %pc64_lo20(s+8)584// lu52i.d vreg4, vreg3, %pc64_hi12(s+8)585// ldx.w vreg6, vreg4, vreg1586587std::optional<int64_t> CommonOffset;588DenseMap<const MachineInstr *, SmallVector<unsigned>>589InlineAsmMemoryOpIndexesMap;590for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {591switch (UseMI.getOpcode()) {592default:593LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);594return false;595case LoongArch::VLDREPL_B:596case LoongArch::XVLDREPL_B:597// We can't do this for large pattern.598if (Last)599return false;600[[fallthrough]];601case LoongArch::LD_B:602case LoongArch::LD_H:603case LoongArch::LD_W:604case LoongArch::LD_D:605case LoongArch::LD_BU:606case LoongArch::LD_HU:607case LoongArch::LD_WU:608case LoongArch::LDPTR_W:609case LoongArch::LDPTR_D:610case LoongArch::FLD_S:611case LoongArch::FLD_D:612case LoongArch::VLD:613case LoongArch::XVLD:614case LoongArch::ST_B:615case LoongArch::ST_H:616case LoongArch::ST_W:617case LoongArch::ST_D:618case LoongArch::STPTR_W:619case LoongArch::STPTR_D:620case LoongArch::FST_S:621case LoongArch::FST_D:622case LoongArch::VST:623case LoongArch::XVST: {624if (UseMI.getOperand(1).isFI())625return false;626// Register defined by Lo should not be the value register.627if (DestReg == UseMI.getOperand(0).getReg())628return false;629assert(DestReg == UseMI.getOperand(1).getReg() &&630"Expected base address use");631// All load/store instructions must use the same offset.632int64_t Offset = UseMI.getOperand(2).getImm();633if (CommonOffset && Offset != CommonOffset)634return false;635CommonOffset = Offset;636break;637}638case LoongArch::INLINEASM:639case LoongArch::INLINEASM_BR: {640// We can't do this for large pattern.641if (Last)642return false;643SmallVector<unsigned> InlineAsmMemoryOpIndexes;644unsigned NumOps = 0;645for (unsigned I = InlineAsm::MIOp_FirstOperand;646I < UseMI.getNumOperands(); I += 1 + NumOps) {647const MachineOperand &FlagsMO = UseMI.getOperand(I);648// Should be an imm.649if (!FlagsMO.isImm())650continue;651652const InlineAsm::Flag Flags(FlagsMO.getImm());653NumOps = Flags.getNumOperandRegisters();654655// Memory constraints have two operands.656if (NumOps != 2 || !Flags.isMemKind()) {657// If the register is used by something other than a memory contraint,658// we should not fold.659for (unsigned J = 0; J < NumOps; ++J) {660const MachineOperand &MO = UseMI.getOperand(I + 1 + J);661if (MO.isReg() && MO.getReg() == DestReg)662return false;663}664continue;665}666667// We can only do this for constraint m.668if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)669return false;670671const MachineOperand &AddrMO = UseMI.getOperand(I + 1);672if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)673continue;674675const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);676if (!OffsetMO.isImm())677continue;678679// All inline asm memory operands must use the same offset.680int64_t Offset = OffsetMO.getImm();681if (CommonOffset && Offset != CommonOffset)682return false;683CommonOffset = Offset;684InlineAsmMemoryOpIndexes.push_back(I + 1);685}686InlineAsmMemoryOpIndexesMap.insert(687std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));688break;689}690}691}692693// We found a common offset.694// Update the offsets in global address lowering.695// We may have already folded some arithmetic so we need to add to any696// existing offset.697int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;698// LA32 ignores the upper 32 bits.699if (!ST->is64Bit())700NewOffset = SignExtend64<32>(NewOffset);701// We can only fold simm32 offsets.702if (!isInt<32>(NewOffset))703return false;704705// If optimized by this pass successfully, MO_RELAX bitmask target-flag should706// be removed from the pcala code sequence. Code sequence of tls-le can still707// be relaxed after being optimized.708//709// For example:710// pcalau12i $a0, %pc_hi20(symbol)711// addi.d $a0, $a0, %pc_lo12(symbol)712// ld.w $a0, $a0, 0713//714// =>715//716// pcalau12i $a0, %pc_hi20(symbol)717// ld.w $a0, $a0, %pc_lo12(symbol)718//719// Code sequence optimized before can be relax by linker. But after being720// optimized, it cannot be relaxed any more. So MO_RELAX flag should not be721// carried by them.722Hi20.getOperand(1).setOffset(NewOffset);723MachineOperand &ImmOp = Lo12.getOperand(2);724ImmOp.setOffset(NewOffset);725if (Lo20 && Hi12) {726Lo20->getOperand(2).setOffset(NewOffset);727Hi12->getOperand(2).setOffset(NewOffset);728}729if (Hi20.getOpcode() == LoongArch::PCALAU12I) {730Hi20.getOperand(1).setTargetFlags(731LoongArchII::getDirectFlags(Hi20.getOperand(1)));732ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));733} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {734MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());735Add->getOperand(3).setOffset(NewOffset);736}737738// Update the immediate in the load/store instructions to add the offset.739const LoongArchInstrInfo &TII = *ST->getInstrInfo();740for (MachineInstr &UseMI :741llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {742if (UseMI.getOpcode() == LoongArch::INLINEASM ||743UseMI.getOpcode() == LoongArch::INLINEASM_BR) {744auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];745for (unsigned I : InlineAsmMemoryOpIndexes) {746MachineOperand &MO = UseMI.getOperand(I + 1);747switch (ImmOp.getType()) {748case MachineOperand::MO_GlobalAddress:749MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),750LoongArchII::getDirectFlags(ImmOp));751break;752case MachineOperand::MO_MCSymbol:753MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),754LoongArchII::getDirectFlags(ImmOp));755MO.setOffset(ImmOp.getOffset());756break;757case MachineOperand::MO_BlockAddress:758MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),759LoongArchII::getDirectFlags(ImmOp));760break;761case MachineOperand::MO_ConstantPoolIndex:762MO.ChangeToCPI(ImmOp.getIndex(), ImmOp.getOffset(),763LoongArchII::getDirectFlags(ImmOp));764break;765default:766report_fatal_error("unsupported machine operand type");767break;768}769}770} else {771UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));772if (Last) {773UseMI.removeOperand(2);774UseMI.removeOperand(1);775UseMI.addOperand(Last->getOperand(1));776UseMI.addOperand(Last->getOperand(2));777UseMI.getOperand(1).setIsKill(false);778UseMI.getOperand(2).setIsKill(false);779} else {780UseMI.removeOperand(2);781UseMI.addOperand(ImmOp);782}783}784}785786if (Last) {787Last->eraseFromParent();788return true;789}790791if (Hi20.getOpcode() == LoongArch::PCALAU12I) {792MRI->replaceRegWith(Lo12.getOperand(0).getReg(),793Hi20.getOperand(0).getReg());794} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {795MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());796MRI->replaceRegWith(Lo12.getOperand(0).getReg(),797Add->getOperand(0).getReg());798}799Lo12.eraseFromParent();800return true;801}802803bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {804if (skipFunction(Fn.getFunction()))805return false;806807ST = &Fn.getSubtarget<LoongArchSubtarget>();808809bool MadeChange = false;810MRI = &Fn.getRegInfo();811for (MachineBasicBlock &MBB : Fn) {812LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");813for (MachineInstr &Hi20 : MBB) {814MachineInstr *Lo12 = nullptr;815MachineInstr *Lo20 = nullptr;816MachineInstr *Hi12 = nullptr;817MachineInstr *Last = nullptr;818if (Hi20.getOpcode() == LoongArch::PCALAU12I) {819// Detect foldable pcala code sequence in small/medium/large code model.820if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))821continue;822} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {823MachineInstr *Add = nullptr;824// Detect foldable tls-le code sequence in small/medium code model.825if (!detectFoldable(Hi20, Add, Lo12))826continue;827} else {828continue;829}830// For tls-le, we do not pass the second PseudoAddTPRel instr in order to831// reuse the existing hooks and the last three paramaters should always be832// nullptr.833MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);834MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);835}836}837838return MadeChange;839}840841/// Returns an instance of the Merge Base Offset Optimization pass.842FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {843return new LoongArchMergeBaseOffsetOpt();844}845846847