// Path: blob/main/contrib/llvm-project/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
// (web viewer residue: 213799 views)
//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8/// \file This file contains the ARM definition DAG scheduling mutations which9/// change inter-instruction latencies10//11//===----------------------------------------------------------------------===//1213#include "ARMLatencyMutations.h"14#include "ARMSubtarget.h"15#include "Thumb2InstrInfo.h"16#include "llvm/Analysis/AliasAnalysis.h"17#include "llvm/CodeGen/ScheduleDAG.h"18#include "llvm/CodeGen/ScheduleDAGMutation.h"19#include "llvm/CodeGen/TargetInstrInfo.h"20#include <algorithm>21#include <array>22#include <initializer_list>23#include <memory>2425namespace llvm {2627namespace {2829// Precompute information about opcodes to speed up pass3031class InstructionInformation {32protected:33struct IInfo {34bool HasBRegAddr : 1; // B-side of addr gen is a register35bool HasBRegAddrShift : 1; // B-side of addr gen has a shift36bool IsDivide : 1; // Some form of integer divide37bool IsInlineShiftALU : 1; // Inline shift+ALU38bool IsMultiply : 1; // Some form of integer multiply39bool IsMVEIntMAC : 1; // MVE 8/16/32-bit integer MAC operation40bool IsNonSubwordLoad : 1; // Load which is a word or larger41bool IsShift : 1; // Shift operation42bool IsRev : 1; // REV operation43bool ProducesQP : 1; // Produces a vector register result44bool ProducesDP : 1; // Produces a double-precision register result45bool ProducesSP : 1; // Produces a single-precision register result46bool ConsumesQP : 1; // Consumes a vector register result47bool ConsumesDP : 1; // Consumes a double-precision register result48bool ConsumesSP : 1; // Consumes a single-precision register result49unsigned MVEIntMACMatched; // Matched operand 
type (for MVE)50unsigned AddressOpMask; // Mask indicating which operands go into AGU51IInfo()52: HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),53IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),54IsNonSubwordLoad(false), IsShift(false), IsRev(false),55ProducesQP(false), ProducesDP(false), ProducesSP(false),56ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),57MVEIntMACMatched(0), AddressOpMask(0) {}58};59typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;60IInfoArray Info;6162public:63// Always available information64unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }65bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }66bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }67bool isDivide(unsigned Op) { return Info[Op].IsDivide; }68bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }69bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }70bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }71bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }72bool isRev(unsigned Op) { return Info[Op].IsRev; }73bool isShift(unsigned Op) { return Info[Op].IsShift; }7475// information available if markDPConsumers is called.76bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }77bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }78bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }79bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }80bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }81bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }8283bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {84return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;85}8687InstructionInformation(const ARMBaseInstrInfo *TII);8889protected:90void markDPProducersConsumers(const ARMBaseInstrInfo *TII);91};9293InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) 
{94using namespace ARM;9596std::initializer_list<unsigned> hasBRegAddrList = {97t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,98tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,99};100for (auto op : hasBRegAddrList) {101Info[op].HasBRegAddr = true;102}103104std::initializer_list<unsigned> hasBRegAddrShiftList = {105t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,106};107for (auto op : hasBRegAddrShiftList) {108Info[op].HasBRegAddrShift = true;109}110111Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;112113std::initializer_list<unsigned> isInlineShiftALUList = {114t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs,115t2ORNrs, t2RSBSrs, t2RSBrs, t2SBCrs, t2SUBrs,116t2SUBSrs, t2CMPrs, t2CMNzrs, t2TEQrs, t2TSTrs,117};118for (auto op : isInlineShiftALUList) {119Info[op].IsInlineShiftALU = true;120}121122Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;123124std::initializer_list<unsigned> isMultiplyList = {125t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,126t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,127t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,128t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,129t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,130t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,131};132for (auto op : isMultiplyList) {133Info[op].IsMultiply = true;134}135136std::initializer_list<unsigned> isMVEIntMACList = {137MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,138MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,139MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,140MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,141MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,142MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,143MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,144MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,145MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,146MVE_VQDMLSDHs16, 
MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,147MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,148MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,149MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,150MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,151};152for (auto op : isMVEIntMACList) {153Info[op].IsMVEIntMAC = true;154}155156std::initializer_list<unsigned> isNonSubwordLoadList = {157t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,158t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,159tLDRpci, tLDRr, tLDRspi,160};161for (auto op : isNonSubwordLoadList) {162Info[op].IsNonSubwordLoad = true;163}164165std::initializer_list<unsigned> isRevList = {166t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,167};168for (auto op : isRevList) {169Info[op].IsRev = true;170}171172std::initializer_list<unsigned> isShiftList = {173t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,174tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,175};176for (auto op : isShiftList) {177Info[op].IsShift = true;178}179180std::initializer_list<unsigned> Address1List = 
{181t2LDRBi12,182t2LDRBi8,183t2LDRBpci,184t2LDRBs,185t2LDRHi12,186t2LDRHi8,187t2LDRHpci,188t2LDRHs,189t2LDRSBi12,190t2LDRSBi8,191t2LDRSBpci,192t2LDRSBs,193t2LDRSHi12,194t2LDRSHi8,195t2LDRSHpci,196t2LDRSHs,197t2LDRi12,198t2LDRi8,199t2LDRpci,200t2LDRs,201tLDRBi,202tLDRBr,203tLDRHi,204tLDRHr,205tLDRSB,206tLDRSH,207tLDRi,208tLDRpci,209tLDRr,210tLDRspi,211t2STRBi12,212t2STRBi8,213t2STRBs,214t2STRHi12,215t2STRHi8,216t2STRHs,217t2STRi12,218t2STRi8,219t2STRs,220tSTRBi,221tSTRBr,222tSTRHi,223tSTRHr,224tSTRi,225tSTRr,226tSTRspi,227VLDRD,228VLDRH,229VLDRS,230VSTRD,231VSTRH,232VSTRS,233MVE_VLD20_16,234MVE_VLD20_32,235MVE_VLD20_8,236MVE_VLD21_16,237MVE_VLD21_32,238MVE_VLD21_8,239MVE_VLD40_16,240MVE_VLD40_32,241MVE_VLD40_8,242MVE_VLD41_16,243MVE_VLD41_32,244MVE_VLD41_8,245MVE_VLD42_16,246MVE_VLD42_32,247MVE_VLD42_8,248MVE_VLD43_16,249MVE_VLD43_32,250MVE_VLD43_8,251MVE_VLDRBS16,252MVE_VLDRBS16_rq,253MVE_VLDRBS32,254MVE_VLDRBS32_rq,255MVE_VLDRBU16,256MVE_VLDRBU16_rq,257MVE_VLDRBU32,258MVE_VLDRBU32_rq,259MVE_VLDRBU8,260MVE_VLDRBU8_rq,261MVE_VLDRDU64_qi,262MVE_VLDRDU64_rq,263MVE_VLDRDU64_rq_u,264MVE_VLDRHS32,265MVE_VLDRHS32_rq,266MVE_VLDRHS32_rq_u,267MVE_VLDRHU16,268MVE_VLDRHU16_rq,269MVE_VLDRHU16_rq_u,270MVE_VLDRHU32,271MVE_VLDRHU32_rq,272MVE_VLDRHU32_rq_u,273MVE_VLDRWU32,274MVE_VLDRWU32_qi,275MVE_VLDRWU32_rq,276MVE_VLDRWU32_rq_u,277MVE_VST20_16,278MVE_VST20_32,279MVE_VST20_8,280MVE_VST21_16,281MVE_VST21_32,282MVE_VST21_8,283MVE_VST40_16,284MVE_VST40_32,285MVE_VST40_8,286MVE_VST41_16,287MVE_VST41_32,288MVE_VST41_8,289MVE_VST42_16,290MVE_VST42_32,291MVE_VST42_8,292MVE_VST43_16,293MVE_VST43_32,294MVE_VST43_8,295MVE_VSTRB16,296MVE_VSTRB16_rq,297MVE_VSTRB32,298MVE_VSTRB32_rq,299MVE_VSTRBU8,300MVE_VSTRB8_rq,301MVE_VSTRD64_qi,302MVE_VSTRD64_rq,303MVE_VSTRD64_rq_u,304MVE_VSTRH32,305MVE_VSTRH32_rq,306MVE_VSTRH32_rq_u,307MVE_VSTRHU16,308MVE_VSTRH16_rq,309MVE_VSTRH16_rq_u,310MVE_VSTRWU32,311MVE_VSTRW32_qi,312MVE_VSTRW32_rq,313MVE_VSTRW32_rq_u,314};315std::initializer_list<unsigned> 
Address2List = {316t2LDRB_POST,317t2LDRB_PRE,318t2LDRDi8,319t2LDRH_POST,320t2LDRH_PRE,321t2LDRSB_POST,322t2LDRSB_PRE,323t2LDRSH_POST,324t2LDRSH_PRE,325t2LDR_POST,326t2LDR_PRE,327t2STRB_POST,328t2STRB_PRE,329t2STRDi8,330t2STRH_POST,331t2STRH_PRE,332t2STR_POST,333t2STR_PRE,334MVE_VLD20_16_wb,335MVE_VLD20_32_wb,336MVE_VLD20_8_wb,337MVE_VLD21_16_wb,338MVE_VLD21_32_wb,339MVE_VLD21_8_wb,340MVE_VLD40_16_wb,341MVE_VLD40_32_wb,342MVE_VLD40_8_wb,343MVE_VLD41_16_wb,344MVE_VLD41_32_wb,345MVE_VLD41_8_wb,346MVE_VLD42_16_wb,347MVE_VLD42_32_wb,348MVE_VLD42_8_wb,349MVE_VLD43_16_wb,350MVE_VLD43_32_wb,351MVE_VLD43_8_wb,352MVE_VLDRBS16_post,353MVE_VLDRBS16_pre,354MVE_VLDRBS32_post,355MVE_VLDRBS32_pre,356MVE_VLDRBU16_post,357MVE_VLDRBU16_pre,358MVE_VLDRBU32_post,359MVE_VLDRBU32_pre,360MVE_VLDRBU8_post,361MVE_VLDRBU8_pre,362MVE_VLDRDU64_qi_pre,363MVE_VLDRHS32_post,364MVE_VLDRHS32_pre,365MVE_VLDRHU16_post,366MVE_VLDRHU16_pre,367MVE_VLDRHU32_post,368MVE_VLDRHU32_pre,369MVE_VLDRWU32_post,370MVE_VLDRWU32_pre,371MVE_VLDRWU32_qi_pre,372MVE_VST20_16_wb,373MVE_VST20_32_wb,374MVE_VST20_8_wb,375MVE_VST21_16_wb,376MVE_VST21_32_wb,377MVE_VST21_8_wb,378MVE_VST40_16_wb,379MVE_VST40_32_wb,380MVE_VST40_8_wb,381MVE_VST41_16_wb,382MVE_VST41_32_wb,383MVE_VST41_8_wb,384MVE_VST42_16_wb,385MVE_VST42_32_wb,386MVE_VST42_8_wb,387MVE_VST43_16_wb,388MVE_VST43_32_wb,389MVE_VST43_8_wb,390MVE_VSTRB16_post,391MVE_VSTRB16_pre,392MVE_VSTRB32_post,393MVE_VSTRB32_pre,394MVE_VSTRBU8_post,395MVE_VSTRBU8_pre,396MVE_VSTRD64_qi_pre,397MVE_VSTRH32_post,398MVE_VSTRH32_pre,399MVE_VSTRHU16_post,400MVE_VSTRHU16_pre,401MVE_VSTRWU32_post,402MVE_VSTRWU32_pre,403MVE_VSTRW32_qi_pre,404};405std::initializer_list<unsigned> Address3List = {406t2LDRD_POST,407t2LDRD_PRE,408t2STRD_POST,409t2STRD_PRE,410};411// Compute a mask of which operands are involved in address computation412for (auto &op : Address1List) {413Info[op].AddressOpMask = 0x6;414}415for (auto &op : Address2List) {416Info[op].AddressOpMask = 0xc;417}418for (auto &op : 
Address3List) {419Info[op].AddressOpMask = 0x18;420}421for (auto &op : hasBRegAddrShiftList) {422Info[op].AddressOpMask |= 0x8;423}424}425426void InstructionInformation::markDPProducersConsumers(427const ARMBaseInstrInfo *TII) {428// Learn about all instructions which have FP source/dest registers429for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {430const MCInstrDesc &MID = TII->get(MI);431auto Operands = MID.operands();432for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {433bool MarkQP = false, MarkDP = false, MarkSP = false;434switch (Operands[OI].RegClass) {435case ARM::MQPRRegClassID:436case ARM::DPRRegClassID:437case ARM::DPR_8RegClassID:438case ARM::DPR_VFP2RegClassID:439case ARM::DPairRegClassID:440case ARM::DPairSpcRegClassID:441case ARM::DQuadRegClassID:442case ARM::DQuadSpcRegClassID:443case ARM::DTripleRegClassID:444case ARM::DTripleSpcRegClassID:445MarkDP = true;446break;447case ARM::QPRRegClassID:448case ARM::QPR_8RegClassID:449case ARM::QPR_VFP2RegClassID:450case ARM::QQPRRegClassID:451case ARM::QQQQPRRegClassID:452MarkQP = true;453break;454case ARM::SPRRegClassID:455case ARM::SPR_8RegClassID:456case ARM::FPWithVPRRegClassID:457MarkSP = true;458break;459default:460break;461}462if (MarkQP) {463if (OI < MID.getNumDefs())464Info[MI].ProducesQP = true;465else466Info[MI].ConsumesQP = true;467}468if (MarkDP) {469if (OI < MID.getNumDefs())470Info[MI].ProducesDP = true;471else472Info[MI].ConsumesDP = true;473}474if (MarkSP) {475if (OI < MID.getNumDefs())476Info[MI].ProducesSP = true;477else478Info[MI].ConsumesSP = true;479}480}481}482}483484} // anonymous namespace485486static bool hasImplicitCPSRUse(const MachineInstr *MI) {487return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);488}489490void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,491unsigned latency) {492SDep Reverse = SrcDep;493Reverse.setSUnit(&SrcSU);494for (SDep &PDep : SrcDep.getSUnit()->Preds) {495if (PDep == Reverse) 
{496PDep.setLatency(latency);497SrcDep.getSUnit()->setDepthDirty();498break;499}500}501SrcDep.setLatency(latency);502SrcSU.setHeightDirty();503}504505static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {506return (a & 0xe) != (b & 0xe);507}508509// Set output dependences to zero latency for processors which can510// simultaneously issue to the same register. Returns true if a change511// was made.512bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {513if (Dep.getKind() == SDep::Output) {514setBidirLatencies(ISU, Dep, 0);515return true;516}517return false;518}519520// The graph doesn't look inside of bundles to determine their521// scheduling boundaries and reports zero latency into and out of them522// (except for CPSR into the bundle, which has latency 1).523// Make some better scheduling assumptions:524// 1) CPSR uses have zero latency; other uses have incoming latency 1525// 2) CPSR defs retain a latency of zero; others have a latency of 1.526//527// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise528unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {529530SUnit &DepSU = *Dep.getSUnit();531const MachineInstr *SrcMI = ISU.getInstr();532unsigned SrcOpcode = SrcMI->getOpcode();533const MachineInstr *DstMI = DepSU.getInstr();534unsigned DstOpcode = DstMI->getOpcode();535536if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {537setBidirLatencies(538ISU, Dep,539(Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 
0 : 1);540return 1;541}542if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&543Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {544setBidirLatencies(ISU, Dep, 1);545return 2;546}547return 0;548}549550// Determine whether there is a memory RAW hazard here and set up latency551// accordingly552bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,553unsigned latency) {554if (!Dep.isNormalMemory())555return false;556auto &SrcInst = *ISU.getInstr();557auto &DstInst = *Dep.getSUnit()->getInstr();558if (!SrcInst.mayStore() || !DstInst.mayLoad())559return false;560561auto SrcMO = *SrcInst.memoperands().begin();562auto DstMO = *DstInst.memoperands().begin();563auto SrcVal = SrcMO->getValue();564auto DstVal = DstMO->getValue();565auto SrcPseudoVal = SrcMO->getPseudoValue();566auto DstPseudoVal = DstMO->getPseudoValue();567if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&568SrcMO->getOffset() == DstMO->getOffset()) {569setBidirLatencies(ISU, Dep, latency);570return true;571} else if (SrcPseudoVal && DstPseudoVal &&572SrcPseudoVal->kind() == DstPseudoVal->kind() &&573SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {574// Spills/fills575auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);576auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);577if (FS0 == FS1) {578setBidirLatencies(ISU, Dep, latency);579return true;580}581}582return false;583}584585namespace {586587std::unique_ptr<InstructionInformation> II;588589class CortexM7InstructionInformation : public InstructionInformation {590public:591CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)592: InstructionInformation(TII) {}593};594595class CortexM7Overrides : public ARMOverrideBypasses {596public:597CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)598: ARMOverrideBypasses(TII, AA) {599if (!II)600II.reset(new CortexM7InstructionInformation(TII));601}602603void modifyBypasses(SUnit &) override;604};605606void 
CortexM7Overrides::modifyBypasses(SUnit &ISU) {607const MachineInstr *SrcMI = ISU.getInstr();608unsigned SrcOpcode = SrcMI->getOpcode();609bool isNSWload = II->isNonSubwordLoad(SrcOpcode);610611// Walk the successors looking for latency overrides that are needed612for (SDep &Dep : ISU.Succs) {613614// Output dependences should have 0 latency, as M7 is able to615// schedule writers to the same register for simultaneous issue.616if (zeroOutputDependences(ISU, Dep))617continue;618619if (memoryRAWHazard(ISU, Dep, 4))620continue;621622// Ignore dependencies other than data623if (Dep.getKind() != SDep::Data)624continue;625626SUnit &DepSU = *Dep.getSUnit();627if (DepSU.isBoundaryNode())628continue;629630if (makeBundleAssumptions(ISU, Dep) == 1)631continue;632633const MachineInstr *DstMI = DepSU.getInstr();634unsigned DstOpcode = DstMI->getOpcode();635636// Word loads into any multiply or divide instruction are considered637// cannot bypass their scheduling stage. Didn't do this in the .td file638// because we cannot easily create a read advance that is 0 from certain639// writer classes and 1 from all the rest.640// (The other way around would have been easy.)641if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))642setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);643644// Word loads into B operand of a load/store are considered cannot bypass645// their scheduling stage. Cannot do in the .td file because646// need to decide between -1 and -2 for ReadAdvance647if (isNSWload && II->hasBRegAddr(DstOpcode) &&648DstMI->getOperand(2).getReg() == Dep.getReg())649setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);650651// Multiplies into any address generation cannot bypass from EX3. 
Cannot do652// in the .td file because need to decide between -1 and -2 for ReadAdvance653if (II->isMultiply(SrcOpcode)) {654unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;655for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {656if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&657DstMI->getOperand(i).getReg() == Dep.getReg()) {658setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1659break;660}661}662}663664// Mismatched conditional producers take longer on M7; they end up looking665// like they were produced at EX3 and read at IS.666if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&667(SrcOpcode == ARM::BUNDLE ||668mismatchedPred(TII->getPredicate(*SrcMI),669TII->getPredicate(*DstMI)))) {670unsigned Lat = 1;671// Operand A of shift+ALU is treated as an EX1 read instead of EX2.672if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&673DstMI->getOperand(1).getReg() == Dep.getReg())674Lat = 2;675Lat = std::min(3u, Dep.getLatency() + Lat);676setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));677}678679// CC setter into conditional producer shouldn't have a latency of more680// than 1 unless it's due to an implicit read. (All the "true" readers681// of the condition code use an implicit read, and predicates use an682// explicit.)683if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&684TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))685setBidirLatencies(ISU, Dep, 1);686687// REV instructions cannot bypass directly into the EX1 shifter. 
The688// code is slightly inexact as it doesn't attempt to ensure that the bypass689// is to the shifter operands.690if (II->isRev(SrcOpcode)) {691if (II->isInlineShiftALU(DstOpcode))692setBidirLatencies(ISU, Dep, 2);693else if (II->isShift(DstOpcode))694setBidirLatencies(ISU, Dep, 1);695}696}697}698699class M85InstructionInformation : public InstructionInformation {700public:701M85InstructionInformation(const ARMBaseInstrInfo *t)702: InstructionInformation(t) {703markDPProducersConsumers(t);704}705};706707class M85Overrides : public ARMOverrideBypasses {708public:709M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)710: ARMOverrideBypasses(t, a) {711if (!II)712II.reset(new M85InstructionInformation(t));713}714715void modifyBypasses(SUnit &) override;716717private:718unsigned computeBypassStage(const MCSchedClassDesc *SCD);719signed modifyMixedWidthFP(const MachineInstr *SrcMI,720const MachineInstr *DstMI, unsigned RegID,721const MCSchedClassDesc *SCD);722};723724unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {725auto SM = DAG->getSchedModel();726unsigned DefIdx = 0; // just look for the first output's timing727if (DefIdx < SCDesc->NumWriteLatencyEntries) {728// Lookup the definition's write latency in SubtargetInfo.729const MCWriteLatencyEntry *WLEntry =730SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);731unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;732if (Latency == 4)733return 2;734else if (Latency == 5)735return 3;736else if (Latency > 3)737return 3;738else739return Latency;740}741return 2;742}743744// Latency changes for bypassing between FP registers of different sizes:745//746// Note that mixed DP/SP are unlikely because of the semantics747// of C. 
Mixed MVE/SP are quite common when MVE intrinsics are used.748signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,749const MachineInstr *DstMI,750unsigned RegID,751const MCSchedClassDesc *SCD) {752753if (!II->producesSP(SrcMI->getOpcode()) &&754!II->producesDP(SrcMI->getOpcode()) &&755!II->producesQP(SrcMI->getOpcode()))756return 0;757758if (Register::isVirtualRegister(RegID)) {759if (II->producesSP(SrcMI->getOpcode()) &&760II->consumesDP(DstMI->getOpcode())) {761for (auto &OP : SrcMI->operands())762if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&763OP.getSubReg() == ARM::ssub_1)764return 5 - computeBypassStage(SCD);765} else if (II->producesSP(SrcMI->getOpcode()) &&766II->consumesQP(DstMI->getOpcode())) {767for (auto &OP : SrcMI->operands())768if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&769(OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))770return 5 - computeBypassStage(SCD) -771((OP.getSubReg() == ARM::ssub_2 ||772OP.getSubReg() == ARM::ssub_3)773? 1774: 0);775} else if (II->producesDP(SrcMI->getOpcode()) &&776II->consumesQP(DstMI->getOpcode())) {777for (auto &OP : SrcMI->operands())778if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&779OP.getSubReg() == ARM::ssub_1)780return -1;781} else if (II->producesDP(SrcMI->getOpcode()) &&782II->consumesSP(DstMI->getOpcode())) {783for (auto &OP : DstMI->operands())784if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&785OP.getSubReg() == ARM::ssub_1)786return 5 - computeBypassStage(SCD);787} else if (II->producesQP(SrcMI->getOpcode()) &&788II->consumesSP(DstMI->getOpcode())) {789for (auto &OP : DstMI->operands())790if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&791(OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))792return 5 - computeBypassStage(SCD) +793((OP.getSubReg() == ARM::ssub_2 ||794OP.getSubReg() == ARM::ssub_3)795? 
1796: 0);797} else if (II->producesQP(SrcMI->getOpcode()) &&798II->consumesDP(DstMI->getOpcode())) {799for (auto &OP : DstMI->operands())800if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&801OP.getSubReg() == ARM::ssub_1)802return 1;803}804} else if (Register::isPhysicalRegister(RegID)) {805// Note that when the producer is narrower, not all of the producers806// may be present in the scheduling graph; somewhere earlier in the807// compiler, an implicit def/use of the aliased full register gets808// added to the producer, and so only that producer is seen as *the*809// single producer. This behavior also has the unfortunate effect of810// serializing the producers in the compiler's view of things.811if (II->producesSP(SrcMI->getOpcode()) &&812II->consumesDP(DstMI->getOpcode())) {813for (auto &OP : SrcMI->operands())814if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&815OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&816(OP.getReg() == RegID ||817(OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||818(OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))819return 5 - computeBypassStage(SCD);820} else if (II->producesSP(SrcMI->getOpcode()) &&821II->consumesQP(DstMI->getOpcode())) {822for (auto &OP : SrcMI->operands())823if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&824OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&825(OP.getReg() == RegID ||826(OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||827(OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))828return 5 - computeBypassStage(SCD) -829(((OP.getReg() - ARM::S0) / 2) % 2 ? 
1 : 0);830} else if (II->producesDP(SrcMI->getOpcode()) &&831II->consumesQP(DstMI->getOpcode())) {832for (auto &OP : SrcMI->operands())833if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&834OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&835(OP.getReg() == RegID ||836(OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))837return -1;838} else if (II->producesDP(SrcMI->getOpcode()) &&839II->consumesSP(DstMI->getOpcode())) {840if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)841return 5 - computeBypassStage(SCD);842} else if (II->producesQP(SrcMI->getOpcode()) &&843II->consumesSP(DstMI->getOpcode())) {844if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)845return 5 - computeBypassStage(SCD) +846(((RegID - ARM::S0) / 2) % 2 ? 1 : 0);847} else if (II->producesQP(SrcMI->getOpcode()) &&848II->consumesDP(DstMI->getOpcode())) {849if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)850return 1;851}852}853return 0;854}855856void M85Overrides::modifyBypasses(SUnit &ISU) {857const MachineInstr *SrcMI = ISU.getInstr();858unsigned SrcOpcode = SrcMI->getOpcode();859bool isNSWload = II->isNonSubwordLoad(SrcOpcode);860861// Walk the successors looking for latency overrides that are needed862for (SDep &Dep : ISU.Succs) {863864// Output dependences should have 0 latency, as CortexM85 is able to865// schedule writers to the same register for simultaneous issue.866if (zeroOutputDependences(ISU, Dep))867continue;868869if (memoryRAWHazard(ISU, Dep, 3))870continue;871872// Ignore dependencies other than data or strong ordering.873if (Dep.getKind() != SDep::Data)874continue;875876SUnit &DepSU = *Dep.getSUnit();877if (DepSU.isBoundaryNode())878continue;879880if (makeBundleAssumptions(ISU, Dep) == 1)881continue;882883const MachineInstr *DstMI = DepSU.getInstr();884unsigned DstOpcode = DstMI->getOpcode();885886// Word loads into B operand of a load/store with cannot bypass their887// scheduling stage. 
Cannot do in the .td file because need to decide888// between -1 and -2 for ReadAdvance889890if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&891DstMI->getOperand(3).getImm() != 0 && // shift operand892DstMI->getOperand(2).getReg() == Dep.getReg())893setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);894895if (isNSWload && isMVEVectorInstruction(DstMI)) {896setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);897}898899if (II->isMVEIntMAC(DstOpcode) &&900II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&901DstMI->getOperand(0).isReg() &&902DstMI->getOperand(0).getReg() == Dep.getReg())903setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);904905// CC setter into conditional producer shouldn't have a latency of more906// than 0 unless it's due to an implicit read.907if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&908TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))909setBidirLatencies(ISU, Dep, 0);910911if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),912DAG->getSchedClass(&ISU)))913setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));914915if (II->isRev(SrcOpcode)) {916if (II->isInlineShiftALU(DstOpcode))917setBidirLatencies(ISU, Dep, 1);918else if (II->isShift(DstOpcode))919setBidirLatencies(ISU, Dep, 1);920}921}922}923924// Add M55 specific overrides for latencies between instructions. 
Currently it:925// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.926class CortexM55Overrides : public ARMOverrideBypasses {927public:928CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)929: ARMOverrideBypasses(TII, AA) {}930931void modifyBypasses(SUnit &SU) override {932MachineInstr *SrcMI = SU.getInstr();933if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))934return;935936for (SDep &Dep : SU.Succs) {937if (Dep.getKind() != SDep::Data)938continue;939SUnit &DepSU = *Dep.getSUnit();940if (DepSU.isBoundaryNode())941continue;942MachineInstr *DstMI = DepSU.getInstr();943944if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())945setBidirLatencies(SU, Dep, 3);946}947}948};949950} // end anonymous namespace951952void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {953DAG = DAGInstrs;954for (SUnit &ISU : DAGInstrs->SUnits) {955if (ISU.isBoundaryNode())956continue;957modifyBypasses(ISU);958}959if (DAGInstrs->ExitSU.getInstr())960modifyBypasses(DAGInstrs->ExitSU);961}962963std::unique_ptr<ScheduleDAGMutation>964createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {965if (ST.isCortexM85())966return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);967else if (ST.isCortexM7())968return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);969else if (ST.isCortexM55())970return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);971972return nullptr;973}974975} // end namespace llvm976977978