Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86CmovConversion.cpp
//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty. This assures no
///         degradation even with 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
//  External interfaces:
//      FunctionPass *llvm::createX86CmovConverterPass();
//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
loops");82STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");8384// This internal switch can be used to turn off the cmov/branch optimization.85static cl::opt<bool>86EnableCmovConverter("x86-cmov-converter",87cl::desc("Enable the X86 cmov-to-branch optimization."),88cl::init(true), cl::Hidden);8990static cl::opt<unsigned>91GainCycleThreshold("x86-cmov-converter-threshold",92cl::desc("Minimum gain per loop (in cycles) threshold."),93cl::init(4), cl::Hidden);9495static cl::opt<bool> ForceMemOperand(96"x86-cmov-converter-force-mem-operand",97cl::desc("Convert cmovs to branches whenever they have memory operands."),98cl::init(true), cl::Hidden);99100static cl::opt<bool> ForceAll(101"x86-cmov-converter-force-all",102cl::desc("Convert all cmovs to branches."),103cl::init(false), cl::Hidden);104105namespace {106107/// Converts X86 cmov instructions into branches when profitable.108class X86CmovConverterPass : public MachineFunctionPass {109public:110X86CmovConverterPass() : MachineFunctionPass(ID) { }111112StringRef getPassName() const override { return "X86 cmov Conversion"; }113bool runOnMachineFunction(MachineFunction &MF) override;114void getAnalysisUsage(AnalysisUsage &AU) const override;115116/// Pass identification, replacement for typeid.117static char ID;118119private:120MachineRegisterInfo *MRI = nullptr;121const TargetInstrInfo *TII = nullptr;122const TargetRegisterInfo *TRI = nullptr;123MachineLoopInfo *MLI = nullptr;124TargetSchedModel TSchedModel;125126/// List of consecutive CMOV instructions.127using CmovGroup = SmallVector<MachineInstr *, 2>;128using CmovGroups = SmallVector<CmovGroup, 2>;129130/// Collect all CMOV-group-candidates in \p CurrLoop and update \p131/// CmovInstGroups accordingly.132///133/// \param Blocks List of blocks to process.134/// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.135/// \returns true iff it found any CMOV-group-candidate.136bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,137CmovGroups &CmovInstGroups,138bool IncludeLoads = false);139140/// Check if it is profitable to transform each CMOV-group-candidates into141/// branch. 
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into a branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfoWrapperPass>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
  // the ones with a memory operand (ForceMemOperand option). The latter CMOVs
  // risk a stall waiting for the load to complete, something that speculative
  // execution behind a branch is better suited to handle on modern x86 chips.
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks;
    for (auto &MBB : MF)
      Blocks.push_back(&MBB);
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory load,
        // always rewrite them. On x86, a CMOV will dramatically amplify any
        // memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // --------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//
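
  // Illustrative example of a CMOV-group-candidate (schematic MIR, not taken
  // from a real test case):
  //   CMP32rr %a, %b, implicit-def $eflags
  //   %v1 = CMOV32rr %t1, %f1, COND_GE, implicit $eflags
  //   %v2 = CMOV32rr %t2, %f2, COND_L,  implicit $eflags
  // The two CMOVs are consecutive, use the same EFLAGS def, and their
  // condition codes are opposite, so together they form a single candidate
  // group.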

  // Currently processed CMOV-Group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator for the currently processed CMOV-group if it should be skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;

      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found an X86::CMOVrr instruction. If it is marked as
      // unpredictable, skip it and do not convert it to a branch.
      if (CC != X86::COND_INVALID &&
          !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
          (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range, reset flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-Group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // End of basic block is considered end of range; check if the currently
    // processed CMOV-group should not be skipped and add it as a
    // CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of a CMOV instruction as if it were converted into a branch.
/// \param TrueOpDepth depth cost of the CMOV true-value operand.
/// \param FalseOpDepth depth cost of the CMOV false-value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weight, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
  return std::max(
      divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
      divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}
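
// Worked example for getDepthOfOptCmov with hypothetical depths: for
// TrueOpDepth = 8 and FalseOpDepth = 2 the two weighted estimates are
// ceil((8 * 3 + 2) / 4) = 7 and ceil((2 * 3 + 8) / 4) = 4, so the function
// returns 7 - the estimate is deliberately dominated by the deeper operand.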

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which the depth is calculated.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps register operand to its def instruction, which can be nullptr if it
  /// is unknown (e.g., operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set depth of unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated as
  //     CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical-path in the
  //       loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" because "uses()" also returns implicit
          // definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
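            // Record which instruction defines this use; OperandToDefMap is
            // later used to look up the depths of the CMOV true/false and
            // condition operands (here in Step 1 and again in Step 3).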
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical-path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth 12.5% (of cycles) less than the original loop, per
  //           iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical-path is iteration dependent - there is a dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
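  // Worked example with hypothetical cycle counts: if LoopDepth[0] =
  // {Depth: 24, OptDepth: 18} and LoopDepth[1] = {Depth: 48, OptDepth: 42},
  // then Diff = {6, 6}. Case 1 applies, Diff[1] >= GainCycleThreshold
  // (default 4) holds, and Diff[0] * 8 >= LoopDepth[0].Depth (48 >= 24)
  // holds, so the loop is considered worth optimizing.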
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff all CMOV instructions in the group are worth optimizing.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than a CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting CMOV
      // instructions used with tree-search-like algorithms, where the branch
      // is hard to predict.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}
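
// Worked example for the Worth-Optimize-CMOV check in Step 3 above, with
// hypothetical depths: if CondCost = 12, both value operands have depth 4 (so
// ValCost = 4) and MispredictPenalty = 20, the gain is 12 - 4 = 8 cycles and
// 8 * 4 >= 20, so the CMOV passes the 25%-of-penalty bar; with CondCost = 5
// the 1-cycle gain would fail it and the whole group would be dropped.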

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether the EFLAGS operand should be LIVE after the MI
  // instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return true;
    if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return false;
  }

  // We hit the end of the block, check whether EFLAGS is live into a
  // successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in
/// between, move all debug instructions to after the last CMOV instruction,
/// making the CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions
  // between the first CMOV and the last CMOV, then pack the group and make
  // the CMOV instructions consecutive by moving the debug instructions to
  // after the last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory-operand CMOV instructions to cope with this, and we ensure
  // memory-operand CMOVs are only included with a single condition code.
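  // For example (schematic): if a memory-operand CMOV in the group moves the
  // loaded value when COND_AE holds and CC is currently COND_AE, CC and OppCC
  // are swapped so the branch is taken on COND_B; the load in FalseMBB then
  // executes only on the COND_AE path, where its value is actually needed.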
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on, as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    // Retain the debug instr number when unfolded.
    unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    if (OldDebugInstrNum)
      NewCMOV->setDebugInstrNum(OldDebugInstrNum);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Add this PHI to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from an earlier PHI's
  // destination register to the registers that went into that PHI.
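  // For instance, in the Before/After sketch near the top of this function,
  // %v3 selects %v1, the result of an earlier CMOV. Its PHI cannot use the PHI
  // that defines %v1, so RegRewriteTable maps %v1 back to the true/false
  // inputs that went into %v1's PHI, and the input for the matching edge is
  // used instead.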
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    Register DestReg = MIIt->getOperand(0).getReg();
    Register Op1Reg = MIIt->getOperand(1).getReg();
    Register Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are processing is the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    // ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // debug-info: we can just copy the instr-ref number from one instruction
    // to the other, seeing how it's a one-for-one substitution.
    if (unsigned InstrNum = MIIt->peekDebugInstrNum())
      MIB->setDebugInstrNum(InstrNum);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
  // the MachineVerifier during testing.
  if (MIItBegin != MIItEnd)
    F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, *MLI);
    L->addBasicBlockToLoop(SinkMBB, *MLI);
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}