Path: blob/main/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the X86-specific support for the FastISel class. Much
// of the target-specific code is generated by tablegen in the file
// X86GenFastISel.inc, which is #included here.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

namespace {

class X86FastISel final : public FastISel {
  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget *Subtarget;

public:
  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
                       const TargetLibraryInfo *libInfo)
      : FastISel(funcInfo, libInfo) {
    Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
  }

  bool fastSelectInstruction(const Instruction *I) override;

  /// The specified machine instr operand is a vreg, and that
  /// vreg is being provided by the specified load instruction. If possible,
  /// try to fold the load as an operand to the instruction, returning true if
  /// possible.
  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                           const LoadInst *LI) override;

  bool fastLowerArguments() override;
  bool fastLowerCall(CallLoweringInfo &CLI) override;
  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;

#include "X86GenFastISel.inc"

private:
  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT,
                          const DebugLoc &DL);

  bool X86FastEmitLoad(MVT VT, X86AddressMode &AM, MachineMemOperand *MMO,
                       unsigned &ResultReg, unsigned Alignment = 1);

  bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM,
                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
  bool X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
                        MachineMemOperand *MMO = nullptr, bool Aligned = false);

  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
                         unsigned &ResultReg);

  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);

  bool X86SelectLoad(const Instruction *I);

  bool X86SelectStore(const Instruction *I);

  bool X86SelectRet(const Instruction *I);

  bool X86SelectCmp(const Instruction *I);

  bool X86SelectZExt(const Instruction *I);

  bool X86SelectSExt(const Instruction *I);

  bool X86SelectBranch(const Instruction *I);

  bool X86SelectShift(const Instruction *I);

  bool X86SelectDivRem(const Instruction *I);

  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);

  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);

  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);

  bool X86SelectSelect(const Instruction *I);

  bool X86SelectTrunc(const Instruction *I);

  bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc,
                               const TargetRegisterClass *RC);

  bool X86SelectFPExt(const Instruction *I);
  bool X86SelectFPTrunc(const Instruction *I);
  bool X86SelectSIToFP(const Instruction *I);
  bool X86SelectUIToFP(const Instruction *I);
  bool X86SelectIntToFP(const Instruction *I, bool IsSigned);

  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();
  }
  const X86TargetMachine *getTargetMachine() const {
    return static_cast<const X86TargetMachine *>(&TM);
  }

  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);

  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
  unsigned fastMaterializeConstant(const Constant *C) override;

  unsigned fastMaterializeAlloca(const AllocaInst *C) override;

  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;

  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
  /// computed in an SSE register, not on the X87 floating point stack.
  bool isScalarFPTypeInSSEReg(EVT VT) const {
    return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
           (VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
  }

  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);

  bool IsMemcpySmall(uint64_t Len);

  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
                          X86AddressMode SrcAM, uint64_t Len);

  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
                            const Value *Cond);

  const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
                                            X86AddressMode &AM);

  unsigned fastEmitInst_rrrr(unsigned MachineInstOpcode,
                             const TargetRegisterClass *RC, unsigned Op0,
                             unsigned Op1, unsigned Op2, unsigned Op3);
};

} // end anonymous namespace.

static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
  unsigned CC;
  bool NeedSwap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (Predicate) {
  default: llvm_unreachable("Unexpected predicate");
  case CmpInst::FCMP_OEQ: CC = 0;          break;
  case CmpInst::FCMP_OGT: NeedSwap = true; [[fallthrough]];
  case CmpInst::FCMP_OLT: CC = 1;          break;
  case CmpInst::FCMP_OGE: NeedSwap = true; [[fallthrough]];
  case CmpInst::FCMP_OLE: CC = 2;          break;
  case CmpInst::FCMP_UNO: CC = 3;          break;
  case CmpInst::FCMP_UNE: CC = 4;          break;
  case CmpInst::FCMP_ULE: NeedSwap = true; [[fallthrough]];
  case CmpInst::FCMP_UGE: CC = 5;          break;
  case CmpInst::FCMP_ULT: NeedSwap = true; [[fallthrough]];
  case CmpInst::FCMP_UGT: CC = 6;          break;
  case CmpInst::FCMP_ORD: CC = 7;          break;
  case CmpInst::FCMP_UEQ: CC = 8;          break;
  case CmpInst::FCMP_ONE: CC = 12;         break;
  }

  return std::make_pair(CC, NeedSwap);
}

/// Adds a complex addressing mode to the given machine instr builder.
/// Note, this will constrain the index register. If it's not possible to
/// constrain the given index register, then a new one will be created. The
/// IndexReg field of the addressing mode will be updated to match in this
/// case.
const MachineInstrBuilder &
X86FastISel::addFullAddress(const MachineInstrBuilder &MIB,
                            X86AddressMode &AM) {
  // First constrain the index register. It needs to be a GR64_NOSP.
  AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg,
                                         MIB->getNumOperands() +
                                             X86::AddrIndexReg);
  return ::addFullAddress(MIB, AM);
}
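
// Illustrative note (editor's addition, not from the original source): for a
// predicate with no direct SSE encoding in the table above, such as FCMP_OGT,
// getX86SSEConditionCode returns {1 /*LT*/, NeedSwap = true}, and the caller
// is expected to swap the two operands before emitting the compare.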

/// Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
                                       const Value *Cond) {
  if (!isa<ExtractValueInst>(Cond))
    return false;

  const auto *EV = cast<ExtractValueInst>(Cond);
  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
    return false;

  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
  MVT RetVT;
  const Function *Callee = II->getCalledFunction();
  Type *RetTy =
    cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
  if (!isTypeLegal(RetTy, RetVT))
    return false;

  if (RetVT != MVT::i32 && RetVT != MVT::i64)
    return false;

  X86::CondCode TmpCC;
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
  }

  // Check if both instructions are in the same basic block.
  if (II->getParent() != I->getParent())
    return false;

  // Make sure nothing is in the way
  BasicBlock::const_iterator Start(I);
  BasicBlock::const_iterator End(II);
  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
    // We only expect extractvalue instructions between the intrinsic and the
    // instruction to be selected.
    if (!isa<ExtractValueInst>(Itr))
      return false;

    // Check that the extractvalue operand comes from the intrinsic.
    const auto *EVI = cast<ExtractValueInst>(Itr);
    if (EVI->getAggregateOperand() != II)
      return false;
  }

  // Make sure no potentially eflags clobbering phi moves can be inserted in
  // between.
  auto HasPhis = [](const BasicBlock *Succ) { return !Succ->phis().empty(); };
  if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
    return false;

  // Make sure there are no potentially eflags clobbering constant
  // materializations in between.
  if (llvm::any_of(I->operands(), [](Value *V) { return isa<Constant>(V); }))
    return false;

  CC = TmpCC;
  return true;
}
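
// Illustrative IR shape that the fold above targets (editor's example, not
// taken from an LLVM test):
//   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %val = extractvalue { i32, i1 } %res, 0
//   %ovf = extractvalue { i32, i1 } %res, 1
//   br i1 %ovf, label %overflow, label %normal
// Here the branch can reuse EFLAGS from the ADD and test X86::COND_O directly
// instead of materializing the i1 result.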

bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
  EVT evt = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  if (evt == MVT::Other || !evt.isSimple())
    // Unhandled type. Halt "fast" selection and bail.
    return false;

  VT = evt.getSimpleVT();
  // For now, require SSE/SSE2 for performing floating-point operations,
  // since x87 requires additional work.
  if (VT == MVT::f64 && !Subtarget->hasSSE2())
    return false;
  if (VT == MVT::f32 && !Subtarget->hasSSE1())
    return false;
  // Similarly, no f80 support yet.
  if (VT == MVT::f80)
    return false;
  // We only handle legal types. For example, on x86-32 the instruction
  // selector contains all of the 64-bit instructions from x86-64,
  // under the assumption that i64 won't be used if the target doesn't
  // support it.
  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
}

/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
/// Return true and the result register by reference if it is possible.
bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
                                  MachineMemOperand *MMO, unsigned &ResultReg,
                                  unsigned Alignment) {
  bool HasSSE1 = Subtarget->hasSSE1();
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasSSE41 = Subtarget->hasSSE41();
  bool HasAVX = Subtarget->hasAVX();
  bool HasAVX2 = Subtarget->hasAVX2();
  bool HasAVX512 = Subtarget->hasAVX512();
  bool HasVLX = Subtarget->hasVLX();
  bool IsNonTemporal = MMO && MMO->isNonTemporal();

  // Treat i1 loads the same as i8 loads. Masking will be done when storing.
  if (VT == MVT::i1)
    VT = MVT::i8;

  // Get opcode and regclass of the output for the given load instruction.
  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: return false;
  case MVT::i8:
    Opc = X86::MOV8rm;
    break;
  case MVT::i16:
    Opc = X86::MOV16rm;
    break;
  case MVT::i32:
    Opc = X86::MOV32rm;
    break;
  case MVT::i64:
    // Must be in x86-64 mode.
    Opc = X86::MOV64rm;
    break;
  case MVT::f32:
    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
          : HasAVX  ? X86::VMOVSSrm_alt
          : HasSSE1 ? X86::MOVSSrm_alt
                    : X86::LD_Fp32m;
    break;
  case MVT::f64:
    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
          : HasAVX  ? X86::VMOVSDrm_alt
          : HasSSE2 ? X86::MOVSDrm_alt
                    : X86::LD_Fp64m;
    break;
  case MVT::f80:
    // No f80 support yet.
    return false;
  case MVT::v4f32:
    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
    else if (Alignment >= 16)
      Opc = HasVLX ? X86::VMOVAPSZ128rm :
            HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
    else
      Opc = HasVLX ? X86::VMOVUPSZ128rm :
            HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
    break;
  case MVT::v2f64:
    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
    else if (Alignment >= 16)
      Opc = HasVLX ? X86::VMOVAPDZ128rm :
            HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
    else
      Opc = HasVLX ? X86::VMOVUPDZ128rm :
            HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
    break;
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v8i16:
  case MVT::v16i8:
    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
    else if (Alignment >= 16)
      Opc = HasVLX ? X86::VMOVDQA64Z128rm :
            HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
    else
      Opc = HasVLX ? X86::VMOVDQU64Z128rm :
            HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
    break;
  case MVT::v8f32:
    assert(HasAVX);
    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
    else if (IsNonTemporal && Alignment >= 16)
      return false; // Force split for X86::VMOVNTDQArm
    else if (Alignment >= 32)
      Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
    else
      Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
    break;
  case MVT::v4f64:
    assert(HasAVX);
    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
    else if (IsNonTemporal && Alignment >= 16)
      return false; // Force split for X86::VMOVNTDQArm
    else if (Alignment >= 32)
      Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
    else
      Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
    break;
  case MVT::v8i32:
  case MVT::v4i64:
  case MVT::v16i16:
  case MVT::v32i8:
    assert(HasAVX);
    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
    else if (IsNonTemporal && Alignment >= 16)
      return false; // Force split for X86::VMOVNTDQArm
    else if (Alignment >= 32)
      Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
    else
      Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
    break;
  case MVT::v16f32:
    assert(HasAVX512);
    if (IsNonTemporal && Alignment >= 64)
      Opc = X86::VMOVNTDQAZrm;
    else
      Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
    break;
  case MVT::v8f64:
    assert(HasAVX512);
    if (IsNonTemporal && Alignment >= 64)
      Opc = X86::VMOVNTDQAZrm;
    else
      Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
    break;
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8:
    assert(HasAVX512);
    // Note: There are a lot more choices based on type with AVX-512, but
    // there's really no advantage when the load isn't masked.
    if (IsNonTemporal && Alignment >= 64)
      Opc = X86::VMOVNTDQAZrm;
    else
      Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
    break;
  }

  const TargetRegisterClass *RC = TLI.getRegClassFor(VT);

  ResultReg = createResultReg(RC);
  MachineInstrBuilder MIB =
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
  addFullAddress(MIB, AM);
  if (MMO)
    MIB->addMemOperand(*FuncInfo.MF, MMO);
  return true;
}

/// X86FastEmitStore - Emit a machine instruction to store a value Val of
/// type VT. The address is either pre-computed, consisting of a base ptr, Ptr
/// and a displacement offset, or a GlobalAddress, i.e. V. Return true if it
/// is possible.
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
                                   MachineMemOperand *MMO, bool Aligned) {
  bool HasSSE1 = Subtarget->hasSSE1();
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasSSE4A = Subtarget->hasSSE4A();
  bool HasAVX = Subtarget->hasAVX();
  bool HasAVX512 = Subtarget->hasAVX512();
  bool HasVLX = Subtarget->hasVLX();
  bool IsNonTemporal = MMO && MMO->isNonTemporal();

  // Get opcode and regclass of the output for the given store instruction.
  unsigned Opc = 0;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f80: // No f80 support yet.
  default: return false;
  case MVT::i1: {
    // Mask out all but lowest bit.
    Register AndResult = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(X86::AND8ri), AndResult)
      .addReg(ValReg).addImm(1);
    ValReg = AndResult;
    [[fallthrough]]; // handle i1 as i8.
  }
  case MVT::i8:  Opc = X86::MOV8mr;  break;
  case MVT::i16: Opc = X86::MOV16mr; break;
  case MVT::i32:
    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr;
    break;
  case MVT::i64:
    // Must be in x86-64 mode.
    Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
    break;
  case MVT::f32:
    if (HasSSE1) {
      if (IsNonTemporal && HasSSE4A)
        Opc = X86::MOVNTSS;
      else
        Opc = HasAVX512 ? X86::VMOVSSZmr :
              HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
    } else
      Opc = X86::ST_Fp32m;
    break;
  case MVT::f64:
    if (HasSSE2) {
      if (IsNonTemporal && HasSSE4A)
        Opc = X86::MOVNTSD;
      else
        Opc = HasAVX512 ? X86::VMOVSDZmr :
              HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
    } else
      Opc = X86::ST_Fp64m;
    break;
  case MVT::x86mmx:
    Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
    break;
  case MVT::v4f32:
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPSZ128mr :
              HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
      else
        Opc = HasVLX ? X86::VMOVAPSZ128mr :
              HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
    } else
      Opc = HasVLX ? X86::VMOVUPSZ128mr :
            HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
    break;
  case MVT::v2f64:
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPDZ128mr :
              HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
      else
        Opc = HasVLX ? X86::VMOVAPDZ128mr :
              HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
    } else
      Opc = HasVLX ? X86::VMOVUPDZ128mr :
            HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
    break;
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v8i16:
  case MVT::v16i8:
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTDQZ128mr :
              HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
      else
        Opc = HasVLX ? X86::VMOVDQA64Z128mr :
              HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
    } else
      Opc = HasVLX ? X86::VMOVDQU64Z128mr :
            HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
    break;
  case MVT::v8f32:
    assert(HasAVX);
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
      else
        Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
    } else
      Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
    break;
  case MVT::v4f64:
    assert(HasAVX);
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
      else
        Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
    } else
      Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
    break;
  case MVT::v8i32:
  case MVT::v4i64:
  case MVT::v16i16:
  case MVT::v32i8:
    assert(HasAVX);
    if (Aligned) {
      if (IsNonTemporal)
        Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
      else
        Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
    } else
      Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
    break;
  case MVT::v16f32:
    assert(HasAVX512);
    if (Aligned)
      Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
    else
      Opc = X86::VMOVUPSZmr;
    break;
  case MVT::v8f64:
    assert(HasAVX512);
    if (Aligned) {
      Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
    } else
      Opc = X86::VMOVUPDZmr;
    break;
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8:
    assert(HasAVX512);
    // Note: There are a lot more choices based on type with AVX-512, but
    // there's really no advantage when the store isn't masked.
    if (Aligned)
      Opc = IsNonTemporal ? X86::VMOVNTDQZmr : X86::VMOVDQA64Zmr;
    else
      Opc = X86::VMOVDQU64Zmr;
    break;
  }

  const MCInstrDesc &Desc = TII.get(Opc);
  // Some of the instructions in the previous switch use FR128 instead
  // of FR32 for ValReg. Make sure the register we feed the instruction
  // matches its register class constraints.
  // Note: It is fine to do a copy from FR32 to FR128; they are the same
  // registers behind the scenes, which is why this did not trigger any
  // bugs before.
  ValReg = constrainOperandRegClass(Desc, ValReg, Desc.getNumOperands() - 1);
  MachineInstrBuilder MIB =
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, Desc);
  addFullAddress(MIB, AM).addReg(ValReg);
  if (MMO)
    MIB->addMemOperand(*FuncInfo.MF, MMO);

  return true;
}

bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
                                   X86AddressMode &AM,
                                   MachineMemOperand *MMO, bool Aligned) {
  // Handle 'null' like i32/i64 0.
  if (isa<ConstantPointerNull>(Val))
    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));

  // If this is a store of a simple constant, fold the constant into the store.
  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
    unsigned Opc = 0;
    bool Signed = true;
    switch (VT.getSimpleVT().SimpleTy) {
    default: break;
    case MVT::i1:
      Signed = false;
      [[fallthrough]]; // Handle as i8.
    case MVT::i8:  Opc = X86::MOV8mi;  break;
    case MVT::i16: Opc = X86::MOV16mi; break;
    case MVT::i32: Opc = X86::MOV32mi; break;
    case MVT::i64:
      // Must be a 32-bit sign extended value.
      if (isInt<32>(CI->getSExtValue()))
        Opc = X86::MOV64mi32;
      break;
    }

    if (Opc) {
      MachineInstrBuilder MIB =
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc));
      addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
                                            : CI->getZExtValue());
      if (MMO)
        MIB->addMemOperand(*FuncInfo.MF, MMO);
      return true;
    }
  }

  Register ValReg = getRegForValue(Val);
  if (ValReg == 0)
    return false;

  return X86FastEmitStore(VT, ValReg, AM, MMO, Aligned);
}

/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
/// ISD::SIGN_EXTEND).
bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
                                    unsigned Src, EVT SrcVT,
                                    unsigned &ResultReg) {
  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
  if (RR == 0)
    return false;

  ResultReg = RR;
  return true;
}

bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
  // Handle constant address.
  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
    // Can't handle alternate code models yet.
    if (TM.getCodeModel() != CodeModel::Small &&
        TM.getCodeModel() != CodeModel::Medium)
      return false;

    // Can't handle large objects yet.
    if (TM.isLargeGlobalValue(GV))
      return false;

    // Can't handle TLS yet.
    if (GV->isThreadLocal())
      return false;

    // Can't handle !absolute_symbol references yet.
    if (GV->isAbsoluteSymbolRef())
      return false;

    // RIP-relative addresses can't have additional register operands, so if
    // we've already folded stuff into the addressing mode, just force the
    // global value into its own register, which we can use as the basereg.
    if (!Subtarget->isPICStyleRIPRel() ||
        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
      // Okay, we've committed to selecting this global. Set up the address.
      AM.GV = GV;

      // Allow the subtarget to classify the global.
      unsigned char GVFlags = Subtarget->classifyGlobalReference(GV);

      // If this reference is relative to the pic base, set it now.
      if (isGlobalRelativeToPICBase(GVFlags)) {
        // FIXME: How do we know Base.Reg is free??
        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
      }

      // Unless the ABI requires an extra load, return a direct reference to
      // the global.
      if (!isGlobalStubReference(GVFlags)) {
        if (Subtarget->isPICStyleRIPRel()) {
          // Use rip-relative addressing if we can. Above we verified that the
          // base and index registers are unused.
          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
          AM.Base.Reg = X86::RIP;
        }
        AM.GVOpFlags = GVFlags;
        return true;
      }

      // Ok, we need to do a load from a stub. If we've already loaded from
      // this stub, reuse the loaded pointer, otherwise emit the load now.
      DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
      Register LoadReg;
      if (I != LocalValueMap.end() && I->second) {
        LoadReg = I->second;
      } else {
        // Issue load from stub.
        unsigned Opc = 0;
        const TargetRegisterClass *RC = nullptr;
        X86AddressMode StubAM;
        StubAM.Base.Reg = AM.Base.Reg;
        StubAM.GV = GV;
        StubAM.GVOpFlags = GVFlags;

        // Prepare for inserting code in the local-value area.
        SavePoint SaveInsertPt = enterLocalValueArea();

        if (TLI.getPointerTy(DL) == MVT::i64) {
          Opc = X86::MOV64rm;
          RC = &X86::GR64RegClass;
        } else {
          Opc = X86::MOV32rm;
          RC = &X86::GR32RegClass;
        }

        if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL ||
            GVFlags == X86II::MO_GOTPCREL_NORELAX)
          StubAM.Base.Reg = X86::RIP;

        LoadReg = createResultReg(RC);
        MachineInstrBuilder LoadMI =
          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), LoadReg);
        addFullAddress(LoadMI, StubAM);

        // Ok, back to normal mode.
        leaveLocalValueArea(SaveInsertPt);

        // Prevent loading GV stub multiple times in same MBB.
        LocalValueMap[V] = LoadReg;
      }

      // Now construct the final address. Note that the Disp, Scale,
      // and Index values may already be set here.
      AM.Base.Reg = LoadReg;
      AM.GV = nullptr;
      return true;
    }
  }

  // If all else fails, try to materialize the value in a register.
  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
    if (AM.Base.Reg == 0) {
      AM.Base.Reg = getRegForValue(V);
      return AM.Base.Reg != 0;
    }
    if (AM.IndexReg == 0) {
      assert(AM.Scale == 1 && "Scale with no index!");
      AM.IndexReg = getRegForValue(V);
      return AM.IndexReg != 0;
    }
  }

  return false;
}
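
// Illustrative example (editor's addition, not from the original source):
// for IR such as
//   %p = getelementptr inbounds i32, ptr %base, i64 %i
//   %v = load i32, ptr %p
// X86SelectAddress below tries to fold the whole GEP into a single
// X86AddressMode of the form [%base + 4*%i], because the element stride (4)
// is one of the scales (1/2/4/8) the hardware addressing mode supports.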

/// X86SelectAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
  SmallVector<const Value *, 32> GEPs;
redo_gep:
  const User *U = nullptr;
  unsigned Opcode = Instruction::UserOp1;
  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Don't walk into other basic blocks; it's possible we haven't
    // visited them yet, so the instructions may not yet be assigned
    // virtual registers.
    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
      Opcode = I->getOpcode();
      U = I;
    }
  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
    Opcode = C->getOpcode();
    U = C;
  }

  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
    if (Ty->getAddressSpace() > 255)
      // Fast instruction selection doesn't support the special
      // address spaces.
      return false;

  switch (Opcode) {
  default: break;
  case Instruction::BitCast:
    // Look past bitcasts.
    return X86SelectAddress(U->getOperand(0), AM);

  case Instruction::IntToPtr:
    // Look past no-op inttoptrs.
    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
        TLI.getPointerTy(DL))
      return X86SelectAddress(U->getOperand(0), AM);
    break;

  case Instruction::PtrToInt:
    // Look past no-op ptrtoints.
    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
      return X86SelectAddress(U->getOperand(0), AM);
    break;

  case Instruction::Alloca: {
    // Do static allocas.
    const AllocaInst *A = cast<AllocaInst>(V);
    DenseMap<const AllocaInst *, int>::iterator SI =
      FuncInfo.StaticAllocaMap.find(A);
    if (SI != FuncInfo.StaticAllocaMap.end()) {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = SI->second;
      return true;
    }
    break;
  }

  case Instruction::Add: {
    // Adds of constants are common and easy enough.
    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
      // They have to fit in the 32-bit signed displacement field though.
      if (isInt<32>(Disp)) {
        AM.Disp = (uint32_t)Disp;
        return X86SelectAddress(U->getOperand(0), AM);
      }
    }
    break;
  }

  case Instruction::GetElementPtr: {
    X86AddressMode SavedAM = AM;

    // Pattern-match simple GEPs.
    uint64_t Disp = (int32_t)AM.Disp;
    unsigned IndexReg = AM.IndexReg;
    unsigned Scale = AM.Scale;
    MVT PtrVT = TLI.getValueType(DL, U->getType()).getSimpleVT();

    gep_type_iterator GTI = gep_type_begin(U);
    // Iterate through the indices, folding what we can. Constants can be
    // folded, and one dynamic index can be handled, if the scale is supported.
    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
         i != e; ++i, ++GTI) {
      const Value *Op = *i;
      if (StructType *STy = GTI.getStructTypeOrNull()) {
        const StructLayout *SL = DL.getStructLayout(STy);
        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
        continue;
      }

      // An array/variable index is always of the form i*S where S is the
      // constant scale size. See if we can push the scale into immediates.
      uint64_t S = GTI.getSequentialElementStride(DL);
      for (;;) {
        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
          // Constant-offset addressing.
          Disp += CI->getSExtValue() * S;
          break;
        }
        if (canFoldAddIntoGEP(U, Op)) {
          // A compatible add with a constant operand. Fold the constant.
          ConstantInt *CI =
            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
          Disp += CI->getSExtValue() * S;
          // Iterate on the other operand.
          Op = cast<AddOperator>(Op)->getOperand(0);
          continue;
        }
        if (IndexReg == 0 &&
            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
            (S == 1 || S == 2 || S == 4 || S == 8)) {
          // Scaled-index addressing.
          Scale = S;
          IndexReg = getRegForGEPIndex(PtrVT, Op);
          if (IndexReg == 0)
            return false;
          break;
        }
        // Unsupported.
        goto unsupported_gep;
      }
    }

    // Check for displacement overflow.
    if (!isInt<32>(Disp))
      break;

    AM.IndexReg = IndexReg;
    AM.Scale = Scale;
    AM.Disp = (uint32_t)Disp;
    GEPs.push_back(V);

    if (const GetElementPtrInst *GEP =
          dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
      // Ok, the GEP indices were covered by constant-offset and scaled-index
      // addressing. Update the address state and move on to examining the base.
      V = GEP;
      goto redo_gep;
    } else if (X86SelectAddress(U->getOperand(0), AM)) {
      return true;
    }

    // If we couldn't merge the gep value into this addr mode, revert back to
    // our address and just match the value instead of completely failing.
    AM = SavedAM;

    for (const Value *I : reverse(GEPs))
      if (handleConstantAddresses(I, AM))
        return true;

    return false;
  unsupported_gep:
    // Ok, the GEP indices weren't all covered.
    break;
  }
  }

  return handleConstantAddresses(V, AM);
}

/// X86SelectCallAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
  const User *U = nullptr;
  unsigned Opcode = Instruction::UserOp1;
  const Instruction *I = dyn_cast<Instruction>(V);
  // Record if the value is defined in the same basic block.
  //
  // This information is crucial to know whether or not folding an
  // operand is valid.
  // Indeed, FastISel generates or reuses a virtual register for all
  // operands of all instructions it selects. Obviously, the definition and
  // its uses must use the same virtual register otherwise the produced
  // code is incorrect.
  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
  // registers for values that are alive across basic blocks. This ensures
  // that the values are consistently set across basic blocks, even if
  // different instruction selection mechanisms are used (e.g., a mix of
  // SDISel and FastISel).
  // For values local to a basic block, the instruction selection process
  // generates these virtual registers with whatever method is appropriate
  // for its needs. In particular, FastISel and SDISel do not share the way
  // local virtual registers are set.
  // Therefore, it is impossible (or at least unsafe) to share values
  // between basic blocks unless they use the same instruction selection
  // method, which is not guaranteed for X86.
  // Moreover, things like hasOneUse could not be used accurately if we
  // allowed references to values across basic blocks even though they are
  // not alive across basic blocks initially.
  bool InMBB = true;
  if (I) {
    Opcode = I->getOpcode();
    U = I;
    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
    Opcode = C->getOpcode();
    U = C;
  }

  switch (Opcode) {
  default: break;
  case Instruction::BitCast:
    // Look past bitcasts if its operand is in the same BB.
    if (InMBB)
      return X86SelectCallAddress(U->getOperand(0), AM);
    break;

  case Instruction::IntToPtr:
    // Look past no-op inttoptrs if its operand is in the same BB.
    if (InMBB &&
        TLI.getValueType(DL, U->getOperand(0)->getType()) ==
          TLI.getPointerTy(DL))
      return X86SelectCallAddress(U->getOperand(0), AM);
    break;

  case Instruction::PtrToInt:
    // Look past no-op ptrtoints if its operand is in the same BB.
    if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
      return X86SelectCallAddress(U->getOperand(0), AM);
    break;
  }

  // Handle constant address.
  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
    // Can't handle alternate code models yet.
    if (TM.getCodeModel() != CodeModel::Small &&
        TM.getCodeModel() != CodeModel::Medium)
      return false;

    // RIP-relative addresses can't have additional register operands.
    if (Subtarget->isPICStyleRIPRel() &&
        (AM.Base.Reg != 0 || AM.IndexReg != 0))
      return false;

    // Can't handle TLS.
    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
      if (GVar->isThreadLocal())
        return false;

    // Okay, we've committed to selecting this global. Set up the basic address.
    AM.GV = GV;

    // Return a direct reference to the global. Fastisel can handle calls to
    // functions that require loads, such as dllimport and nonlazybind
    // functions.
    if (Subtarget->isPICStyleRIPRel()) {
      // Use rip-relative addressing if we can. Above we verified that the
      // base and index registers are unused.
      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
      AM.Base.Reg = X86::RIP;
    } else {
      AM.GVOpFlags = Subtarget->classifyLocalReference(nullptr);
    }

    return true;
  }

  // If all else fails, try to materialize the value in a register.
  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
    auto GetCallRegForValue = [this](const Value *V) {
      Register Reg = getRegForValue(V);

      // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
      if (Reg && Subtarget->isTarget64BitILP32()) {
        Register CopyReg = createResultReg(&X86::GR32RegClass);
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32rr),
                CopyReg)
          .addReg(Reg);

        Register ExtReg = createResultReg(&X86::GR64RegClass);
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
          .addImm(0)
          .addReg(CopyReg)
          .addImm(X86::sub_32bit);
        Reg = ExtReg;
      }

      return Reg;
    };

    if (AM.Base.Reg == 0) {
      AM.Base.Reg = GetCallRegForValue(V);
      return AM.Base.Reg != 0;
    }
    if (AM.IndexReg == 0) {
      assert(AM.Scale == 1 && "Scale with no index!");
      AM.IndexReg = GetCallRegForValue(V);
      return AM.IndexReg != 0;
    }
  }

  return false;
}


/// X86SelectStore - Select and emit code to implement store instructions.
bool X86FastISel::X86SelectStore(const Instruction *I) {
  // Atomic stores need special handling.
  const StoreInst *S = cast<StoreInst>(I);

  if (S->isAtomic())
    return false;

  const Value *PtrV = I->getOperand(1);
  if (TLI.supportSwiftError()) {
    // Swifterror values can come from either a function parameter with
    // swifterror attribute or an alloca with swifterror attribute.
    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
      if (Arg->hasSwiftErrorAttr())
        return false;
    }

    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
      if (Alloca->isSwiftError())
        return false;
    }
  }

  const Value *Val = S->getValueOperand();
  const Value *Ptr = S->getPointerOperand();

  MVT VT;
  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
    return false;

  Align Alignment = S->getAlign();
  Align ABIAlignment = DL.getABITypeAlign(Val->getType());
  bool Aligned = Alignment >= ABIAlignment;

  X86AddressMode AM;
  if (!X86SelectAddress(Ptr, AM))
    return false;

  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
}
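
// Illustrative note (editor's addition, not from the original source): a
// store such as 'store <4 x float> %v, ptr %p, align 16' satisfies the
// Aligned check above (16 >= the ABI alignment of <4 x float>) and can use
// the MOVAPS/VMOVAPS forms in X86FastEmitStore, while 'align 4' falls back
// to the unaligned MOVUPS forms.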

/// X86SelectRet - Select and emit code to implement ret instructions.
bool X86FastISel::X86SelectRet(const Instruction *I) {
  const ReturnInst *Ret = cast<ReturnInst>(I);
  const Function &F = *I->getParent()->getParent();
  const X86MachineFunctionInfo *X86MFInfo =
      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();

  if (!FuncInfo.CanLowerReturn)
    return false;

  if (TLI.supportSwiftError() &&
      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
    return false;

  if (TLI.supportSplitCSR(FuncInfo.MF))
    return false;

  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::C &&
      CC != CallingConv::Fast &&
      CC != CallingConv::Tail &&
      CC != CallingConv::SwiftTail &&
      CC != CallingConv::X86_FastCall &&
      CC != CallingConv::X86_StdCall &&
      CC != CallingConv::X86_ThisCall &&
      CC != CallingConv::X86_64_SysV &&
      CC != CallingConv::Win64)
    return false;

  // Don't handle popping bytes if they don't fit the ret's immediate.
  if (!isUInt<16>(X86MFInfo->getBytesToPopOnReturn()))
    return false;

  // fastcc with -tailcallopt is intended to provide a guaranteed
  // tail call optimization. Fastisel doesn't know how to do that.
  if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
      CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
    return false;

  // Let SDISel handle vararg functions.
  if (F.isVarArg())
    return false;

  // Build a list of return value registers.
  SmallVector<unsigned, 4> RetRegs;

  if (Ret->getNumOperands() > 0) {
    SmallVector<ISD::OutputArg, 4> Outs;
    GetReturnInfo(CC, F.getReturnType(), F.getAttributes(), Outs, TLI, DL);

    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ValLocs;
    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_X86);

    const Value *RV = Ret->getOperand(0);
    Register Reg = getRegForValue(RV);
    if (Reg == 0)
      return false;

    // Only handle a single return value for now.
    if (ValLocs.size() != 1)
      return false;

    CCValAssign &VA = ValLocs[0];

    // Don't bother handling odd stuff for now.
    if (VA.getLocInfo() != CCValAssign::Full)
      return false;
    // Only handle register returns for now.
    if (!VA.isRegLoc())
      return false;

    // The calling-convention tables for x87 returns don't tell
    // the whole story.
    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
      return false;

    unsigned SrcReg = Reg + VA.getValNo();
    EVT SrcVT = TLI.getValueType(DL, RV->getType());
    EVT DstVT = VA.getValVT();
    // Special handling for extended integers.
    if (SrcVT != DstVT) {
      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
        return false;

      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
        return false;

      if (SrcVT == MVT::i1) {
        if (Outs[0].Flags.isSExt())
          return false;
        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg);
        SrcVT = MVT::i8;
      }
      if (SrcVT != DstVT) {
        unsigned Op =
            Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
        SrcReg =
            fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg);
      }
    }

    // Make the copy.
    Register DstReg = VA.getLocReg();
    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
    // Avoid a cross-class copy. This is very unlikely.
    if (!SrcRC->contains(DstReg))
      return false;
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);

    // Add register to return instruction.
    RetRegs.push_back(VA.getLocReg());
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  if (F.hasStructRetAttr() && CC != CallingConv::Swift &&
      CC != CallingConv::SwiftTail) {
    Register Reg = X86MFInfo->getSRetReturnReg();
    assert(Reg &&
           "SRetReturnReg should have been set in LowerFormalArguments()!");
    unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
    RetRegs.push_back(RetReg);
  }

  // Now emit the RET.
  MachineInstrBuilder MIB;
  if (X86MFInfo->getBytesToPopOnReturn()) {
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                  TII.get(Subtarget->is64Bit() ? X86::RETI64 : X86::RETI32))
              .addImm(X86MFInfo->getBytesToPopOnReturn());
  } else {
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                  TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32));
  }
  for (unsigned Reg : RetRegs)
    MIB.addReg(Reg, RegState::Implicit);
  return true;
}

/// X86SelectLoad - Select and emit code to implement load instructions.
///
bool X86FastISel::X86SelectLoad(const Instruction *I) {
  const LoadInst *LI = cast<LoadInst>(I);

  // Atomic loads need special handling.
  if (LI->isAtomic())
    return false;

  const Value *SV = I->getOperand(0);
  if (TLI.supportSwiftError()) {
    // Swifterror values can come from either a function parameter with
    // swifterror attribute or an alloca with swifterror attribute.
    if (const Argument *Arg = dyn_cast<Argument>(SV)) {
      if (Arg->hasSwiftErrorAttr())
        return false;
    }

    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
      if (Alloca->isSwiftError())
        return false;
    }
  }

  MVT VT;
  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
    return false;

  const Value *Ptr = LI->getPointerOperand();

  X86AddressMode AM;
  if (!X86SelectAddress(Ptr, AM))
    return false;

  unsigned ResultReg = 0;
  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
                       LI->getAlign().value()))
    return false;

  updateValueMap(I, ResultReg);
  return true;
}

static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
  bool HasAVX512 = Subtarget->hasAVX512();
  bool HasAVX = Subtarget->hasAVX();
  bool HasSSE1 = Subtarget->hasSSE1();
  bool HasSSE2 = Subtarget->hasSSE2();

  switch (VT.getSimpleVT().SimpleTy) {
  default:       return 0;
  case MVT::i8:  return X86::CMP8rr;
  case MVT::i16: return X86::CMP16rr;
  case MVT::i32: return X86::CMP32rr;
  case MVT::i64: return X86::CMP64rr;
  case MVT::f32:
    return HasAVX512 ? X86::VUCOMISSZrr
           : HasAVX  ? X86::VUCOMISSrr
           : HasSSE1 ? X86::UCOMISSrr
                     : 0;
  case MVT::f64:
    return HasAVX512 ? X86::VUCOMISDZrr
           : HasAVX  ? X86::VUCOMISDrr
           : HasSSE2 ? X86::UCOMISDrr
                     : 0;
  }
}
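
// Illustrative note (editor's addition, not from the original source): for a
// compare such as 'icmp eq i32 %x, 42', X86FastEmitCompare below prefers the
// immediate form CMP32ri over materializing 42 into a register and using
// CMP32rr; for i64 the immediate form is only used when the constant fits in
// a sign-extended 32-bit field.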

/// If the RHS of the comparison is a constant that can be encoded as an
/// immediate, return an opcode that folds it into the compare
/// (e.g. CMP32ri); otherwise return 0.
static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
  switch (VT.getSimpleVT().SimpleTy) {
  // Otherwise, we can't fold the immediate into this comparison.
  default:
    return 0;
  case MVT::i8:
    return X86::CMP8ri;
  case MVT::i16:
    return X86::CMP16ri;
  case MVT::i32:
    return X86::CMP32ri;
  case MVT::i64:
    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
    // field.
    return isInt<32>(RHSC->getSExtValue()) ? X86::CMP64ri32 : 0;
  }
}

bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
                                     const DebugLoc &CurMIMD) {
  Register Op0Reg = getRegForValue(Op0);
  if (Op0Reg == 0) return false;

  // Handle 'null' like i32/i64 0.
  if (isa<ConstantPointerNull>(Op1))
    Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));

  // We have two options: compare with register or immediate. If the RHS of
  // the compare is an immediate that we can fold into this compare, use
  // CMPri, otherwise use CMPrr.
  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareImmOpc))
        .addReg(Op0Reg)
        .addImm(Op1C->getSExtValue());
      return true;
    }
  }

  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
  if (CompareOpc == 0) return false;

  Register Op1Reg = getRegForValue(Op1);
  if (Op1Reg == 0) return false;
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurMIMD, TII.get(CompareOpc))
    .addReg(Op0Reg)
    .addReg(Op1Reg);

  return true;
}

bool X86FastISel::X86SelectCmp(const Instruction *I) {
  const CmpInst *CI = cast<CmpInst>(I);

  MVT VT;
  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
    return false;

  // Below code only works for scalars.
  if (VT.isVector())
    return false;

  // Try to optimize or fold the cmp.
  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
  unsigned ResultReg = 0;
  switch (Predicate) {
  default: break;
  case CmpInst::FCMP_FALSE: {
    ResultReg = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV32r0),
            ResultReg);
    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, X86::sub_8bit);
    if (!ResultReg)
      return false;
    break;
  }
  case CmpInst::FCMP_TRUE: {
    ResultReg = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
            ResultReg).addImm(1);
    break;
  }
  }

  if (ResultReg) {
    updateValueMap(I, ResultReg);
    return true;
  }

  const Value *LHS = CI->getOperand(0);
  const Value *RHS = CI->getOperand(1);

  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
  // We don't have to materialize a zero constant for this case and can just use
  // %x again on the RHS.
  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
    const auto *RHSC = dyn_cast<ConstantFP>(RHS);
    if (RHSC && RHSC->isNullValue())
      RHS = LHS;
  }

  // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
  static const uint16_t SETFOpcTable[2][3] = {
    { X86::COND_E,  X86::COND_NP, X86::AND8rr },
    { X86::COND_NE, X86::COND_P,  X86::OR8rr }
  };
  const uint16_t *SETFOpc = nullptr;
  switch (Predicate) {
  default: break;
  case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
  case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
  }

  ResultReg = createResultReg(&X86::GR8RegClass);
  if (SETFOpc) {
    if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
      return false;

    Register FlagReg1 = createResultReg(&X86::GR8RegClass);
    Register FlagReg2 = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
            FlagReg1).addImm(SETFOpc[0]);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
            FlagReg2).addImm(SETFOpc[1]);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(SETFOpc[2]),
            ResultReg).addReg(FlagReg1).addReg(FlagReg2);
    updateValueMap(I, ResultReg);
    return true;
  }

  X86::CondCode CC;
  bool SwapArgs;
  std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
  assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

  if (SwapArgs)
    std::swap(LHS, RHS);

  // Emit a compare of LHS/RHS.
  if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
    return false;

  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),
          ResultReg).addImm(CC);
  updateValueMap(I, ResultReg);
  return true;
}

bool X86FastISel::X86SelectZExt(const Instruction *I) {
  EVT DstVT = TLI.getValueType(DL, I->getType());
  if (!TLI.isTypeLegal(DstVT))
    return false;

  Register ResultReg = getRegForValue(I->getOperand(0));
  if (ResultReg == 0)
    return false;

  // Handle zero-extension from i1 to i8, which is common.
  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
  if (SrcVT == MVT::i1) {
    // Set the high bits to zero.
    ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
    SrcVT = MVT::i8;

    if (ResultReg == 0)
      return false;
  }

  if (DstVT == MVT::i64) {
    // Handle extension to 64-bits via sub-register shenanigans.
    unsigned MovInst;

    switch (SrcVT.SimpleTy) {
    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
    case MVT::i16: MovInst = X86::MOVZX32rr16; break;
    case MVT::i32: MovInst = X86::MOV32rr;     break;
    default: llvm_unreachable("Unexpected zext to i64 source type");
    }

    Register Result32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(MovInst), Result32)
      .addReg(ResultReg);

    ResultReg = createResultReg(&X86::GR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::SUBREG_TO_REG),
            ResultReg)
      .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
  } else if (DstVT == MVT::i16) {
    // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
    // extend to 32-bits and then extract down to 16-bits.
    Register Result32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVZX32rr8),
            Result32).addReg(ResultReg);

    ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
  } else if (DstVT != MVT::i8) {
    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
                           ResultReg);
    if (ResultReg == 0)
      return false;
  }

  updateValueMap(I, ResultReg);
  return true;
}

bool X86FastISel::X86SelectSExt(const Instruction *I) {
  EVT DstVT = TLI.getValueType(DL, I->getType());
  if (!TLI.isTypeLegal(DstVT))
    return false;

  Register ResultReg = getRegForValue(I->getOperand(0));
  if (ResultReg == 0)
    return false;

  // Handle sign-extension from i1 to i8.
  MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
  if (SrcVT == MVT::i1) {
    // Set the high bits to zero.
    Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg);
    if (ZExtReg == 0)
      return false;

    // Negate the result to make an 8-bit sign extended value.
    ResultReg = createResultReg(&X86::GR8RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::NEG8r),
            ResultReg).addReg(ZExtReg);

    SrcVT = MVT::i8;
  }

  if (DstVT == MVT::i16) {
    // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
    // extend to 32-bits and then extract down to 16-bits.
    Register Result32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOVSX32rr8),
            Result32).addReg(ResultReg);

    ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, X86::sub_16bit);
  } else if (DstVT != MVT::i8) {
    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
                           ResultReg);
    if (ResultReg == 0)
      return false;
  }

  updateValueMap(I, ResultReg);
  return true;
}

bool X86FastISel::X86SelectBranch(const Instruction *I) {
  // Unconditional branches are selected by tablegen-generated code.
  // Handle a conditional branch.
  const BranchInst *BI = cast<BranchInst>(I);
  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];

  // Fold the common case of a conditional branch with a comparison
  // in the same block (values defined on other blocks may not have
  // initialized registers).
  X86::CondCode CC;
  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
      EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType());

      // Try to optimize or fold the cmp.
      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
      switch (Predicate) {
      default: break;
      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, MIMD.getDL()); return true;
      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, MIMD.getDL()); return true;
      }

      const Value *CmpLHS = CI->getOperand(0);
      const Value *CmpRHS = CI->getOperand(1);

      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
      // 0.0.
      // We don't have to materialize a zero constant for this case and can just
      // use %x again on the RHS.
      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
        const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
        if (CmpRHSC && CmpRHSC->isNullValue())
          CmpRHS = CmpLHS;
      }

      // Try to take advantage of fallthrough opportunities.
      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
        std::swap(TrueMBB, FalseMBB);
        Predicate = CmpInst::getInversePredicate(Predicate);
      }

      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
      // code check. Instead two branch instructions are required to check all
      // the flags. First we change the predicate to a supported condition code,
      // which will be the first branch. Later on we will emit the second
      // branch.
      bool NeedExtraBranch = false;
      switch (Predicate) {
      default: break;
      case CmpInst::FCMP_OEQ:
        std::swap(TrueMBB, FalseMBB);
        [[fallthrough]];
      case CmpInst::FCMP_UNE:
        NeedExtraBranch = true;
        Predicate = CmpInst::FCMP_ONE;
        break;
      }

      bool SwapArgs;
      std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

      if (SwapArgs)
        std::swap(CmpLHS, CmpRHS);

      // Emit a compare of the LHS and RHS, setting the flags.
      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
        return false;

      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
        .addMBB(TrueMBB).addImm(CC);

      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
      // to UNE above).
      if (NeedExtraBranch) {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
          .addMBB(TrueMBB).addImm(X86::COND_P);
      }

      finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
      return true;
    }
  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
    // typically happen for _Bool and C++ bools.
    MVT SourceVT;
    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
      unsigned TestOpc = 0;
      switch (SourceVT.SimpleTy) {
      default: break;
      case MVT::i8:  TestOpc = X86::TEST8ri; break;
      case MVT::i16: TestOpc = X86::TEST16ri; break;
      case MVT::i32: TestOpc = X86::TEST32ri; break;
      case MVT::i64: TestOpc = X86::TEST64ri32; break;
      }
      if (TestOpc) {
        Register OpReg = getRegForValue(TI->getOperand(0));
        if (OpReg == 0) return false;

        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TestOpc))
          .addReg(OpReg).addImm(1);

        unsigned JmpCond = X86::COND_NE;
        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
          std::swap(TrueMBB, FalseMBB);
          JmpCond = X86::COND_E;
        }

        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
          .addMBB(TrueMBB).addImm(JmpCond);

        finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
        return true;
      }
    }
  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
    // Fake request the condition, otherwise the intrinsic might be completely
    // optimized away.
    Register TmpReg = getRegForValue(BI->getCondition());
    if (TmpReg == 0)
      return false;

    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))
      .addMBB(TrueMBB).addImm(CC);
    finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);
    return true;
  }

  // Otherwise do a clumsy setcc and re-test it.
  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
  // in an explicit cast, so make sure to handle that correctly.
  Register OpReg = getRegForValue(BI->getCondition());
  if (OpReg == 0) return false;

  // In case OpReg is a K register, COPY to a GPR
  if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
&X86::VK1RegClass) {1772unsigned KOpReg = OpReg;1773OpReg = createResultReg(&X86::GR32RegClass);1774BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1775TII.get(TargetOpcode::COPY), OpReg)1776.addReg(KOpReg);1777OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, X86::sub_8bit);1778}1779BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))1780.addReg(OpReg)1781.addImm(1);1782BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::JCC_1))1783.addMBB(TrueMBB).addImm(X86::COND_NE);1784finishCondBranch(BI->getParent(), TrueMBB, FalseMBB);1785return true;1786}17871788bool X86FastISel::X86SelectShift(const Instruction *I) {1789unsigned CReg = 0, OpReg = 0;1790const TargetRegisterClass *RC = nullptr;1791if (I->getType()->isIntegerTy(8)) {1792CReg = X86::CL;1793RC = &X86::GR8RegClass;1794switch (I->getOpcode()) {1795case Instruction::LShr: OpReg = X86::SHR8rCL; break;1796case Instruction::AShr: OpReg = X86::SAR8rCL; break;1797case Instruction::Shl: OpReg = X86::SHL8rCL; break;1798default: return false;1799}1800} else if (I->getType()->isIntegerTy(16)) {1801CReg = X86::CX;1802RC = &X86::GR16RegClass;1803switch (I->getOpcode()) {1804default: llvm_unreachable("Unexpected shift opcode");1805case Instruction::LShr: OpReg = X86::SHR16rCL; break;1806case Instruction::AShr: OpReg = X86::SAR16rCL; break;1807case Instruction::Shl: OpReg = X86::SHL16rCL; break;1808}1809} else if (I->getType()->isIntegerTy(32)) {1810CReg = X86::ECX;1811RC = &X86::GR32RegClass;1812switch (I->getOpcode()) {1813default: llvm_unreachable("Unexpected shift opcode");1814case Instruction::LShr: OpReg = X86::SHR32rCL; break;1815case Instruction::AShr: OpReg = X86::SAR32rCL; break;1816case Instruction::Shl: OpReg = X86::SHL32rCL; break;1817}1818} else if (I->getType()->isIntegerTy(64)) {1819CReg = X86::RCX;1820RC = &X86::GR64RegClass;1821switch (I->getOpcode()) {1822default: llvm_unreachable("Unexpected shift opcode");1823case Instruction::LShr: OpReg = X86::SHR64rCL; break;1824case Instruction::AShr: OpReg = X86::SAR64rCL; break;1825case Instruction::Shl: OpReg = X86::SHL64rCL; break;1826}1827} else {1828return false;1829}18301831MVT VT;1832if (!isTypeLegal(I->getType(), VT))1833return false;18341835Register Op0Reg = getRegForValue(I->getOperand(0));1836if (Op0Reg == 0) return false;18371838Register Op1Reg = getRegForValue(I->getOperand(1));1839if (Op1Reg == 0) return false;1840BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),1841CReg).addReg(Op1Reg);18421843// The shift instruction uses X86::CL. 
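  // Illustrative sketch only (the vreg names below are placeholders, not
  // actual output): for IR such as
  //   %r = shl i32 %x, %y
  // the sequence built here is roughly
  //   $ecx = COPY %y          ; shift amount into the super-register of CL
  //   KILL $cl                ; narrow the live range to CL (see below)
  //   %r   = SHL32rCL %x      ; the shift amount is read implicitly from CL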
If we defined a super-register1844// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.1845if (CReg != X86::CL)1846BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1847TII.get(TargetOpcode::KILL), X86::CL)1848.addReg(CReg, RegState::Kill);18491850Register ResultReg = createResultReg(RC);1851BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(OpReg), ResultReg)1852.addReg(Op0Reg);1853updateValueMap(I, ResultReg);1854return true;1855}18561857bool X86FastISel::X86SelectDivRem(const Instruction *I) {1858const static unsigned NumTypes = 4; // i8, i16, i32, i641859const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem1860const static bool S = true; // IsSigned1861const static bool U = false; // !IsSigned1862const static unsigned Copy = TargetOpcode::COPY;1863// For the X86 DIV/IDIV instruction, in most cases the dividend1864// (numerator) must be in a specific register pair highreg:lowreg,1865// producing the quotient in lowreg and the remainder in highreg.1866// For most data types, to set up the instruction, the dividend is1867// copied into lowreg, and lowreg is sign-extended or zero-extended1868// into highreg. The exception is i8, where the dividend is defined1869// as a single register rather than a register pair, and we1870// therefore directly sign-extend or zero-extend the dividend into1871// lowreg, instead of copying, and ignore the highreg.1872const static struct DivRemEntry {1873// The following portion depends only on the data type.1874const TargetRegisterClass *RC;1875unsigned LowInReg; // low part of the register pair1876unsigned HighInReg; // high part of the register pair1877// The following portion depends on both the data type and the operation.1878struct DivRemResult {1879unsigned OpDivRem; // The specific DIV/IDIV opcode to use.1880unsigned OpSignExtend; // Opcode for sign-extending lowreg into1881// highreg, or copying a zero into highreg.1882unsigned OpCopy; // Opcode for copying dividend into lowreg, or1883// zero/sign-extending into lowreg for i8.1884unsigned DivRemResultReg; // Register containing the desired result.1885bool IsOpSigned; // Whether to use signed or unsigned form.1886} ResultTable[NumOps];1887} OpTable[NumTypes] = {1888{ &X86::GR8RegClass, X86::AX, 0, {1889{ X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv1890{ X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem1891{ X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv1892{ X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem1893}1894}, // i81895{ &X86::GR16RegClass, X86::AX, X86::DX, {1896{ X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv1897{ X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem1898{ X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv1899{ X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem1900}1901}, // i161902{ &X86::GR32RegClass, X86::EAX, X86::EDX, {1903{ X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv1904{ X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem1905{ X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv1906{ X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem1907}1908}, // i321909{ &X86::GR64RegClass, X86::RAX, X86::RDX, {1910{ X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv1911{ X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem1912{ X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv1913{ X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem1914}1915}, // i641916};19171918MVT VT;1919if (!isTypeLegal(I->getType(), VT))1920return false;19211922unsigned TypeIndex, OpIndex;1923switch (VT.SimpleTy) 
{1924default: return false;1925case MVT::i8: TypeIndex = 0; break;1926case MVT::i16: TypeIndex = 1; break;1927case MVT::i32: TypeIndex = 2; break;1928case MVT::i64: TypeIndex = 3;1929if (!Subtarget->is64Bit())1930return false;1931break;1932}19331934switch (I->getOpcode()) {1935default: llvm_unreachable("Unexpected div/rem opcode");1936case Instruction::SDiv: OpIndex = 0; break;1937case Instruction::SRem: OpIndex = 1; break;1938case Instruction::UDiv: OpIndex = 2; break;1939case Instruction::URem: OpIndex = 3; break;1940}19411942const DivRemEntry &TypeEntry = OpTable[TypeIndex];1943const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];1944Register Op0Reg = getRegForValue(I->getOperand(0));1945if (Op0Reg == 0)1946return false;1947Register Op1Reg = getRegForValue(I->getOperand(1));1948if (Op1Reg == 0)1949return false;19501951// Move op0 into low-order input register.1952BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1953TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);1954// Zero-extend or sign-extend into high-order input register.1955if (OpEntry.OpSignExtend) {1956if (OpEntry.IsOpSigned)1957BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1958TII.get(OpEntry.OpSignExtend));1959else {1960Register Zero32 = createResultReg(&X86::GR32RegClass);1961BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1962TII.get(X86::MOV32r0), Zero32);19631964// Copy the zero into the appropriate sub/super/identical physical1965// register. Unfortunately the operations needed are not uniform enough1966// to fit neatly into the table above.1967if (VT == MVT::i16) {1968BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1969TII.get(Copy), TypeEntry.HighInReg)1970.addReg(Zero32, 0, X86::sub_16bit);1971} else if (VT == MVT::i32) {1972BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1973TII.get(Copy), TypeEntry.HighInReg)1974.addReg(Zero32);1975} else if (VT == MVT::i64) {1976BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1977TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)1978.addImm(0).addReg(Zero32).addImm(X86::sub_32bit);1979}1980}1981}1982// Generate the DIV/IDIV instruction.1983BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,1984TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);1985// For i8 remainder, we can't reference ah directly, as we'll end1986// up with bogus copies like %r9b = COPY %ah. Reference ax1987// instead to prevent ah references in a rex instruction.1988//1989// The current assumption of the fast register allocator is that isel1990// won't generate explicit references to the GR8_NOREX registers. 
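  // Illustrative sketch of the workaround that follows (register names are
  // placeholders, not actual output): for
  //   %r = srem i8 %a, %b
  // on x86-64 the remainder is read out of AX instead of AH:
  //   %t0 = COPY $ax          ; AL = quotient, AH = remainder
  //   %t1 = SHR16ri %t0, 8    ; shift the remainder down into the low byte
  //   %r  = COPY %t1.sub_8bit
  // so that no REX-encoded instruction ever names AH directly.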
If1991// the allocator and/or the backend get enhanced to be more robust in1992// that regard, this can be, and should be, removed.1993unsigned ResultReg = 0;1994if ((I->getOpcode() == Instruction::SRem ||1995I->getOpcode() == Instruction::URem) &&1996OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {1997Register SourceSuperReg = createResultReg(&X86::GR16RegClass);1998Register ResultSuperReg = createResultReg(&X86::GR16RegClass);1999BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2000TII.get(Copy), SourceSuperReg).addReg(X86::AX);20012002// Shift AX right by 8 bits instead of using AH.2003BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SHR16ri),2004ResultSuperReg).addReg(SourceSuperReg).addImm(8);20052006// Now reference the 8-bit subreg of the result.2007ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,2008X86::sub_8bit);2009}2010// Copy the result out of the physreg if we haven't already.2011if (!ResultReg) {2012ResultReg = createResultReg(TypeEntry.RC);2013BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Copy), ResultReg)2014.addReg(OpEntry.DivRemResultReg);2015}2016updateValueMap(I, ResultReg);20172018return true;2019}20202021/// Emit a conditional move instruction (if the are supported) to lower2022/// the select.2023bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {2024// Check if the subtarget supports these instructions.2025if (!Subtarget->canUseCMOV())2026return false;20272028// FIXME: Add support for i8.2029if (RetVT < MVT::i16 || RetVT > MVT::i64)2030return false;20312032const Value *Cond = I->getOperand(0);2033const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);2034bool NeedTest = true;2035X86::CondCode CC = X86::COND_NE;20362037// Optimize conditions coming from a compare if both instructions are in the2038// same basic block (values defined in other basic blocks may not have2039// initialized registers).2040const auto *CI = dyn_cast<CmpInst>(Cond);2041if (CI && (CI->getParent() == I->getParent())) {2042CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);20432044// FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.2045static const uint16_t SETFOpcTable[2][3] = {2046{ X86::COND_NP, X86::COND_E, X86::TEST8rr },2047{ X86::COND_P, X86::COND_NE, X86::OR8rr }2048};2049const uint16_t *SETFOpc = nullptr;2050switch (Predicate) {2051default: break;2052case CmpInst::FCMP_OEQ:2053SETFOpc = &SETFOpcTable[0][0];2054Predicate = CmpInst::ICMP_NE;2055break;2056case CmpInst::FCMP_UNE:2057SETFOpc = &SETFOpcTable[1][0];2058Predicate = CmpInst::ICMP_NE;2059break;2060}20612062bool NeedSwap;2063std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);2064assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");20652066const Value *CmpLHS = CI->getOperand(0);2067const Value *CmpRHS = CI->getOperand(1);2068if (NeedSwap)2069std::swap(CmpLHS, CmpRHS);20702071EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());2072// Emit a compare of the LHS and RHS, setting the flags.2073if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))2074return false;20752076if (SETFOpc) {2077Register FlagReg1 = createResultReg(&X86::GR8RegClass);2078Register FlagReg2 = createResultReg(&X86::GR8RegClass);2079BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),2080FlagReg1).addImm(SETFOpc[0]);2081BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),2082FlagReg2).addImm(SETFOpc[1]);2083auto const &II = TII.get(SETFOpc[2]);2084if (II.getNumDefs()) {2085Register TmpReg = 
createResultReg(&X86::GR8RegClass);
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, TmpReg)
            .addReg(FlagReg2).addReg(FlagReg1);
      } else {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
            .addReg(FlagReg2).addReg(FlagReg1);
      }
    }
    NeedTest = false;
  } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
    // Fake request the condition, otherwise the intrinsic might be completely
    // optimized away.
    Register TmpReg = getRegForValue(Cond);
    if (TmpReg == 0)
      return false;

    NeedTest = false;
  }

  if (NeedTest) {
    // Selects operate on i1; however, CondReg is 8 bits wide and may contain
    // garbage. Indeed, only the least significant bit is supposed to be
    // accurate. If we read more than the lsb, we may see non-zero values
    // whereas the lsb is zero. Therefore, we have to truncate CondReg to i1
    // for the select. This is achieved by performing TEST against 1.
    Register CondReg = getRegForValue(Cond);
    if (CondReg == 0)
      return false;

    // In case CondReg is a K register, COPY to a GPR.
    if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
      unsigned KCondReg = CondReg;
      CondReg = createResultReg(&X86::GR32RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
              TII.get(TargetOpcode::COPY), CondReg)
          .addReg(KCondReg);
      CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);
    }
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))
        .addReg(CondReg)
        .addImm(1);
  }

  const Value *LHS = I->getOperand(1);
  const Value *RHS = I->getOperand(2);

  Register RHSReg = getRegForValue(RHS);
  Register LHSReg = getRegForValue(LHS);
  if (!LHSReg || !RHSReg)
    return false;

  const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC) / 8, false,
                                    Subtarget->hasNDD());
  Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
  updateValueMap(I, ResultReg);
  return true;
}

/// Emit SSE or AVX instructions to lower the select.
///
/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
/// SSE instructions are available.
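///
/// As a rough sketch (assuming all operands are already in XMM registers and
/// using placeholder names), a select such as
///   %c = fcmp olt float %a, %b
///   %r = select i1 %c, float %t, float %f
/// becomes approximately:
///   %mask = CMPSSrri %a, %b, 1    ; imm 1 = LT; all-ones if %a < %b
///   %and  = ANDPSrr  %mask, %t    ;  mask & true value
///   %andn = ANDNPSrr %mask, %f    ; ~mask & false value
///   %r    = ORPSrr   %andn, %and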
/// If AVX is available, try to use a VBLENDV.
bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
  // Optimize conditions coming from a compare if both instructions are in the
  // same basic block (values defined in other basic blocks may not have
  // initialized registers).
  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
  if (!CI || (CI->getParent() != I->getParent()))
    return false;

  if (I->getType() != CI->getOperand(0)->getType() ||
      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
    return false;

  const Value *CmpLHS = CI->getOperand(0);
  const Value *CmpRHS = CI->getOperand(1);
  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);

  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
  // We don't have to materialize a zero constant for this case and can just
  // use %x again on the RHS.
  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
    if (CmpRHSC && CmpRHSC->isNullValue())
      CmpRHS = CmpLHS;
  }

  unsigned CC;
  bool NeedSwap;
  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
  if (CC > 7 && !Subtarget->hasAVX())
    return false;

  if (NeedSwap)
    std::swap(CmpLHS, CmpRHS);

  const Value *LHS = I->getOperand(1);
  const Value *RHS = I->getOperand(2);

  Register LHSReg = getRegForValue(LHS);
  Register RHSReg = getRegForValue(RHS);
  Register CmpLHSReg = getRegForValue(CmpLHS);
  Register CmpRHSReg = getRegForValue(CmpRHS);
  if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
    return false;

  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
  unsigned ResultReg;

  if (Subtarget->hasAVX512()) {
    // If we have AVX512 we can use a mask compare and masked movss/sd.
    const TargetRegisterClass *VR128X = &X86::VR128XRegClass;
    const TargetRegisterClass *VK1 = &X86::VK1RegClass;

    unsigned CmpOpcode =
        (RetVT == MVT::f32) ? X86::VCMPSSZrri : X86::VCMPSDZrri;
    Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpRHSReg,
                                       CC);

    // Need an IMPLICIT_DEF for the input that is used to generate the upper
    // bits of the result register since it's not based on any of the inputs.
    Register ImplicitDefReg = createResultReg(VR128X);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);

    // Use RHSReg as the passthru of the masked movss/sd operation and put
    // LHS in the input. The mask input comes from the compare.
    unsigned MovOpcode =
        (RetVT == MVT::f32) ? X86::VMOVSSZrrk : X86::VMOVSDZrrk;
    unsigned MovReg = fastEmitInst_rrrr(MovOpcode, VR128X, RHSReg, CmpReg,
                                        ImplicitDefReg, LHSReg);

    ResultReg = createResultReg(RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(TargetOpcode::COPY), ResultReg).addReg(MovReg);

  } else if (Subtarget->hasAVX()) {
    const TargetRegisterClass *VR128 = &X86::VR128RegClass;

    // If we have AVX, create 1 blendv instead of 3 logic instructions.
    // Blendv was introduced with SSE 4.1, but the 2 register form implicitly
    // uses XMM0 as the selection register. That may need just as many
    // instructions as the AND/ANDN/OR sequence due to register moves, so
    // don't bother.
    unsigned CmpOpcode =
        (RetVT == MVT::f32) ? X86::VCMPSSrri : X86::VCMPSDrri;
    unsigned BlendOpcode =
        (RetVT == MVT::f32) ?
X86::VBLENDVPSrrr : X86::VBLENDVPDrrr;22362237Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,2238CC);2239Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, LHSReg,2240CmpReg);2241ResultReg = createResultReg(RC);2242BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2243TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg);2244} else {2245// Choose the SSE instruction sequence based on data type (float or double).2246static const uint16_t OpcTable[2][4] = {2247{ X86::CMPSSrri, X86::ANDPSrr, X86::ANDNPSrr, X86::ORPSrr },2248{ X86::CMPSDrri, X86::ANDPDrr, X86::ANDNPDrr, X86::ORPDrr }2249};22502251const uint16_t *Opc = nullptr;2252switch (RetVT.SimpleTy) {2253default: return false;2254case MVT::f32: Opc = &OpcTable[0][0]; break;2255case MVT::f64: Opc = &OpcTable[1][0]; break;2256}22572258const TargetRegisterClass *VR128 = &X86::VR128RegClass;2259Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpRHSReg, CC);2260Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, LHSReg);2261Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, RHSReg);2262Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, AndReg);2263ResultReg = createResultReg(RC);2264BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2265TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);2266}2267updateValueMap(I, ResultReg);2268return true;2269}22702271bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {2272// These are pseudo CMOV instructions and will be later expanded into control-2273// flow.2274unsigned Opc;2275switch (RetVT.SimpleTy) {2276default: return false;2277case MVT::i8: Opc = X86::CMOV_GR8; break;2278case MVT::i16: Opc = X86::CMOV_GR16; break;2279case MVT::i32: Opc = X86::CMOV_GR32; break;2280case MVT::f16:2281Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;2282case MVT::f32:2283Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;2284case MVT::f64:2285Opc = Subtarget->hasAVX512() ? 
X86::CMOV_FR64X : X86::CMOV_FR64; break;2286}22872288const Value *Cond = I->getOperand(0);2289X86::CondCode CC = X86::COND_NE;22902291// Optimize conditions coming from a compare if both instructions are in the2292// same basic block (values defined in other basic blocks may not have2293// initialized registers).2294const auto *CI = dyn_cast<CmpInst>(Cond);2295if (CI && (CI->getParent() == I->getParent())) {2296bool NeedSwap;2297std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());2298if (CC > X86::LAST_VALID_COND)2299return false;23002301const Value *CmpLHS = CI->getOperand(0);2302const Value *CmpRHS = CI->getOperand(1);23032304if (NeedSwap)2305std::swap(CmpLHS, CmpRHS);23062307EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType());2308if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))2309return false;2310} else {2311Register CondReg = getRegForValue(Cond);2312if (CondReg == 0)2313return false;23142315// In case OpReg is a K register, COPY to a GPR2316if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {2317unsigned KCondReg = CondReg;2318CondReg = createResultReg(&X86::GR32RegClass);2319BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2320TII.get(TargetOpcode::COPY), CondReg)2321.addReg(KCondReg);2322CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, X86::sub_8bit);2323}2324BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TEST8ri))2325.addReg(CondReg)2326.addImm(1);2327}23282329const Value *LHS = I->getOperand(1);2330const Value *RHS = I->getOperand(2);23312332Register LHSReg = getRegForValue(LHS);2333Register RHSReg = getRegForValue(RHS);2334if (!LHSReg || !RHSReg)2335return false;23362337const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);23382339Register ResultReg =2340fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);2341updateValueMap(I, ResultReg);2342return true;2343}23442345bool X86FastISel::X86SelectSelect(const Instruction *I) {2346MVT RetVT;2347if (!isTypeLegal(I->getType(), RetVT))2348return false;23492350// Check if we can fold the select.2351if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {2352CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);2353const Value *Opnd = nullptr;2354switch (Predicate) {2355default: break;2356case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;2357case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;2358}2359// No need for a select anymore - this is an unconditional move.2360if (Opnd) {2361Register OpReg = getRegForValue(Opnd);2362if (OpReg == 0)2363return false;2364const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);2365Register ResultReg = createResultReg(RC);2366BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2367TII.get(TargetOpcode::COPY), ResultReg)2368.addReg(OpReg);2369updateValueMap(I, ResultReg);2370return true;2371}2372}23732374// First try to use real conditional move instructions.2375if (X86FastEmitCMoveSelect(RetVT, I))2376return true;23772378// Try to use a sequence of SSE instructions to simulate a conditional move.2379if (X86FastEmitSSESelect(RetVT, I))2380return true;23812382// Fall-back to pseudo conditional move instructions, which will be later2383// converted to control-flow.2384if (X86FastEmitPseudoSelect(RetVT, I))2385return true;23862387return false;2388}23892390// Common code for X86SelectSIToFP and X86SelectUIToFP.2391bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {2392// The target-independent selection algorithm in FastISel already knows how2393// to select a SINT_TO_FP if the target is SSE but not AVX.2394// Early exit 
if the subtarget doesn't have AVX.2395// Unsigned conversion requires avx512.2396bool HasAVX512 = Subtarget->hasAVX512();2397if (!Subtarget->hasAVX() || (!IsSigned && !HasAVX512))2398return false;23992400// TODO: We could sign extend narrower types.2401EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());2402if (SrcVT != MVT::i32 && SrcVT != MVT::i64)2403return false;24042405// Select integer to float/double conversion.2406Register OpReg = getRegForValue(I->getOperand(0));2407if (OpReg == 0)2408return false;24092410unsigned Opcode;24112412static const uint16_t SCvtOpc[2][2][2] = {2413{ { X86::VCVTSI2SSrr, X86::VCVTSI642SSrr },2414{ X86::VCVTSI2SDrr, X86::VCVTSI642SDrr } },2415{ { X86::VCVTSI2SSZrr, X86::VCVTSI642SSZrr },2416{ X86::VCVTSI2SDZrr, X86::VCVTSI642SDZrr } },2417};2418static const uint16_t UCvtOpc[2][2] = {2419{ X86::VCVTUSI2SSZrr, X86::VCVTUSI642SSZrr },2420{ X86::VCVTUSI2SDZrr, X86::VCVTUSI642SDZrr },2421};2422bool Is64Bit = SrcVT == MVT::i64;24232424if (I->getType()->isDoubleTy()) {2425// s/uitofp int -> double2426Opcode = IsSigned ? SCvtOpc[HasAVX512][1][Is64Bit] : UCvtOpc[1][Is64Bit];2427} else if (I->getType()->isFloatTy()) {2428// s/uitofp int -> float2429Opcode = IsSigned ? SCvtOpc[HasAVX512][0][Is64Bit] : UCvtOpc[0][Is64Bit];2430} else2431return false;24322433MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();2434const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);2435Register ImplicitDefReg = createResultReg(RC);2436BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2437TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);2438Register ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, OpReg);2439updateValueMap(I, ResultReg);2440return true;2441}24422443bool X86FastISel::X86SelectSIToFP(const Instruction *I) {2444return X86SelectIntToFP(I, /*IsSigned*/true);2445}24462447bool X86FastISel::X86SelectUIToFP(const Instruction *I) {2448return X86SelectIntToFP(I, /*IsSigned*/false);2449}24502451// Helper method used by X86SelectFPExt and X86SelectFPTrunc.2452bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,2453unsigned TargetOpc,2454const TargetRegisterClass *RC) {2455assert((I->getOpcode() == Instruction::FPExt ||2456I->getOpcode() == Instruction::FPTrunc) &&2457"Instruction must be an FPExt or FPTrunc!");2458bool HasAVX = Subtarget->hasAVX();24592460Register OpReg = getRegForValue(I->getOperand(0));2461if (OpReg == 0)2462return false;24632464unsigned ImplicitDefReg;2465if (HasAVX) {2466ImplicitDefReg = createResultReg(RC);2467BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2468TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);24692470}24712472Register ResultReg = createResultReg(RC);2473MachineInstrBuilder MIB;2474MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpc),2475ResultReg);24762477if (HasAVX)2478MIB.addReg(ImplicitDefReg);24792480MIB.addReg(OpReg);2481updateValueMap(I, ResultReg);2482return true;2483}24842485bool X86FastISel::X86SelectFPExt(const Instruction *I) {2486if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&2487I->getOperand(0)->getType()->isFloatTy()) {2488bool HasAVX512 = Subtarget->hasAVX512();2489// fpext from float to double.2490unsigned Opc =2491HasAVX512 ? X86::VCVTSS2SDZrr2492: Subtarget->hasAVX() ? 
X86::VCVTSS2SDrr : X86::CVTSS2SDrr;2493return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f64));2494}24952496return false;2497}24982499bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {2500if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&2501I->getOperand(0)->getType()->isDoubleTy()) {2502bool HasAVX512 = Subtarget->hasAVX512();2503// fptrunc from double to float.2504unsigned Opc =2505HasAVX512 ? X86::VCVTSD2SSZrr2506: Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;2507return X86SelectFPExtOrFPTrunc(I, Opc, TLI.getRegClassFor(MVT::f32));2508}25092510return false;2511}25122513bool X86FastISel::X86SelectTrunc(const Instruction *I) {2514EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());2515EVT DstVT = TLI.getValueType(DL, I->getType());25162517// This code only handles truncation to byte.2518if (DstVT != MVT::i8 && DstVT != MVT::i1)2519return false;2520if (!TLI.isTypeLegal(SrcVT))2521return false;25222523Register InputReg = getRegForValue(I->getOperand(0));2524if (!InputReg)2525// Unhandled operand. Halt "fast" selection and bail.2526return false;25272528if (SrcVT == MVT::i8) {2529// Truncate from i8 to i1; no code needed.2530updateValueMap(I, InputReg);2531return true;2532}25332534// Issue an extract_subreg.2535Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg,2536X86::sub_8bit);2537if (!ResultReg)2538return false;25392540updateValueMap(I, ResultReg);2541return true;2542}25432544bool X86FastISel::IsMemcpySmall(uint64_t Len) {2545return Len <= (Subtarget->is64Bit() ? 32 : 16);2546}25472548bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,2549X86AddressMode SrcAM, uint64_t Len) {25502551// Make sure we don't bloat code by inlining very large memcpy's.2552if (!IsMemcpySmall(Len))2553return false;25542555bool i64Legal = Subtarget->is64Bit();25562557// We don't care about alignment here since we just emit integer accesses.2558while (Len) {2559MVT VT;2560if (Len >= 8 && i64Legal)2561VT = MVT::i64;2562else if (Len >= 4)2563VT = MVT::i32;2564else if (Len >= 2)2565VT = MVT::i16;2566else2567VT = MVT::i8;25682569unsigned Reg;2570bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);2571RV &= X86FastEmitStore(VT, Reg, DestAM);2572assert(RV && "Failed to emit load or store??");2573(void)RV;25742575unsigned Size = VT.getSizeInBits()/8;2576Len -= Size;2577DestAM.Disp += Size;2578SrcAM.Disp += Size;2579}25802581return true;2582}25832584bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {2585// FIXME: Handle more intrinsics.2586switch (II->getIntrinsicID()) {2587default: return false;2588case Intrinsic::convert_from_fp16:2589case Intrinsic::convert_to_fp16: {2590if (Subtarget->useSoftFloat() || !Subtarget->hasF16C())2591return false;25922593const Value *Op = II->getArgOperand(0);2594Register InputReg = getRegForValue(Op);2595if (InputReg == 0)2596return false;25972598// F16C only allows converting from float to half and from half to float.2599bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16;2600if (IsFloatToHalf) {2601if (!Op->getType()->isFloatTy())2602return false;2603} else {2604if (!II->getType()->isFloatTy())2605return false;2606}26072608unsigned ResultReg = 0;2609const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16);2610if (IsFloatToHalf) {2611// 'InputReg' is implicitly promoted from register class FR32 to2612// register class VR128 by method 'constrainOperandRegClass' which is2613// directly called by 'fastEmitInst_ri'.2614// Instruction VCVTPS2PHrr takes an extra immediate operand 
which is2615// used to provide rounding control: use MXCSR.RC, encoded as 0b100.2616// It's consistent with the other FP instructions, which are usually2617// controlled by MXCSR.2618unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr2619: X86::VCVTPS2PHrr;2620InputReg = fastEmitInst_ri(Opc, RC, InputReg, 4);26212622// Move the lower 32-bits of ResultReg to another register of class GR32.2623Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr2624: X86::VMOVPDI2DIrr;2625ResultReg = createResultReg(&X86::GR32RegClass);2626BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)2627.addReg(InputReg, RegState::Kill);26282629// The result value is in the lower 16-bits of ResultReg.2630unsigned RegIdx = X86::sub_16bit;2631ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, RegIdx);2632} else {2633assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");2634// Explicitly zero-extend the input to 32-bit.2635InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg);26362637// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.2638InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,2639InputReg);26402641unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr2642: X86::VCVTPH2PSrr;2643InputReg = fastEmitInst_r(Opc, RC, InputReg);26442645// The result value is in the lower 32-bits of ResultReg.2646// Emit an explicit copy from register class VR128 to register class FR32.2647ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));2648BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2649TII.get(TargetOpcode::COPY), ResultReg)2650.addReg(InputReg, RegState::Kill);2651}26522653updateValueMap(II, ResultReg);2654return true;2655}2656case Intrinsic::frameaddress: {2657MachineFunction *MF = FuncInfo.MF;2658if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI())2659return false;26602661Type *RetTy = II->getCalledFunction()->getReturnType();26622663MVT VT;2664if (!isTypeLegal(RetTy, VT))2665return false;26662667unsigned Opc;2668const TargetRegisterClass *RC = nullptr;26692670switch (VT.SimpleTy) {2671default: llvm_unreachable("Invalid result type for frameaddress.");2672case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;2673case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;2674}26752676// This needs to be set before we call getPtrSizedFrameRegister, otherwise2677// we get the wrong frame register.2678MachineFrameInfo &MFI = MF->getFrameInfo();2679MFI.setFrameAddressIsTaken(true);26802681const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();2682unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF);2683assert(((FrameReg == X86::RBP && VT == MVT::i64) ||2684(FrameReg == X86::EBP && VT == MVT::i32)) &&2685"Invalid Frame Register!");26862687// Always make a copy of the frame register to a vreg first, so that we2688// never directly reference the frame register (the TwoAddressInstruction-2689// Pass doesn't like that).2690Register SrcReg = createResultReg(RC);2691BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2692TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);26932694// Now recursively load from the frame address.2695// movq (%rbp), %rax2696// movq (%rax), %rax2697// movq (%rax), %rax2698// ...2699unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();2700while (Depth--) {2701Register DestReg = createResultReg(RC);2702addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2703TII.get(Opc), DestReg), SrcReg);2704SrcReg = DestReg;2705}27062707updateValueMap(II, 
SrcReg);2708return true;2709}2710case Intrinsic::memcpy: {2711const MemCpyInst *MCI = cast<MemCpyInst>(II);2712// Don't handle volatile or variable length memcpys.2713if (MCI->isVolatile())2714return false;27152716if (isa<ConstantInt>(MCI->getLength())) {2717// Small memcpy's are common enough that we want to do them2718// without a call if possible.2719uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();2720if (IsMemcpySmall(Len)) {2721X86AddressMode DestAM, SrcAM;2722if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||2723!X86SelectAddress(MCI->getRawSource(), SrcAM))2724return false;2725TryEmitSmallMemcpy(DestAM, SrcAM, Len);2726return true;2727}2728}27292730unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;2731if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))2732return false;27332734if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)2735return false;27362737return lowerCallTo(II, "memcpy", II->arg_size() - 1);2738}2739case Intrinsic::memset: {2740const MemSetInst *MSI = cast<MemSetInst>(II);27412742if (MSI->isVolatile())2743return false;27442745unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;2746if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))2747return false;27482749if (MSI->getDestAddressSpace() > 255)2750return false;27512752return lowerCallTo(II, "memset", II->arg_size() - 1);2753}2754case Intrinsic::stackprotector: {2755// Emit code to store the stack guard onto the stack.2756EVT PtrTy = TLI.getPointerTy(DL);27572758const Value *Op1 = II->getArgOperand(0); // The guard's value.2759const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));27602761MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);27622763// Grab the frame index.2764X86AddressMode AM;2765if (!X86SelectAddress(Slot, AM)) return false;2766if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;2767return true;2768}2769case Intrinsic::dbg_declare: {2770const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);2771X86AddressMode AM;2772assert(DI->getAddress() && "Null address should be checked earlier!");2773if (!X86SelectAddress(DI->getAddress(), AM))2774return false;2775const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);2776assert(DI->getVariable()->isValidLocationForIntrinsic(MIMD.getDL()) &&2777"Expected inlined-at fields to agree");2778addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II), AM)2779.addImm(0)2780.addMetadata(DI->getVariable())2781.addMetadata(DI->getExpression());2782return true;2783}2784case Intrinsic::trap: {2785BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::TRAP));2786return true;2787}2788case Intrinsic::sqrt: {2789if (!Subtarget->hasSSE1())2790return false;27912792Type *RetTy = II->getCalledFunction()->getReturnType();27932794MVT VT;2795if (!isTypeLegal(RetTy, VT))2796return false;27972798// Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT2799// is not generated by FastISel yet.2800// FIXME: Update this code once tablegen can handle it.2801static const uint16_t SqrtOpc[3][2] = {2802{ X86::SQRTSSr, X86::SQRTSDr },2803{ X86::VSQRTSSr, X86::VSQRTSDr },2804{ X86::VSQRTSSZr, X86::VSQRTSDZr },2805};2806unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :2807Subtarget->hasAVX() ? 
1 :28080;2809unsigned Opc;2810switch (VT.SimpleTy) {2811default: return false;2812case MVT::f32: Opc = SqrtOpc[AVXLevel][0]; break;2813case MVT::f64: Opc = SqrtOpc[AVXLevel][1]; break;2814}28152816const Value *SrcVal = II->getArgOperand(0);2817Register SrcReg = getRegForValue(SrcVal);28182819if (SrcReg == 0)2820return false;28212822const TargetRegisterClass *RC = TLI.getRegClassFor(VT);2823unsigned ImplicitDefReg = 0;2824if (AVXLevel > 0) {2825ImplicitDefReg = createResultReg(RC);2826BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2827TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);2828}28292830Register ResultReg = createResultReg(RC);2831MachineInstrBuilder MIB;2832MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),2833ResultReg);28342835if (ImplicitDefReg)2836MIB.addReg(ImplicitDefReg);28372838MIB.addReg(SrcReg);28392840updateValueMap(II, ResultReg);2841return true;2842}2843case Intrinsic::sadd_with_overflow:2844case Intrinsic::uadd_with_overflow:2845case Intrinsic::ssub_with_overflow:2846case Intrinsic::usub_with_overflow:2847case Intrinsic::smul_with_overflow:2848case Intrinsic::umul_with_overflow: {2849// This implements the basic lowering of the xalu with overflow intrinsics2850// into add/sub/mul followed by either seto or setb.2851const Function *Callee = II->getCalledFunction();2852auto *Ty = cast<StructType>(Callee->getReturnType());2853Type *RetTy = Ty->getTypeAtIndex(0U);2854assert(Ty->getTypeAtIndex(1)->isIntegerTy() &&2855Ty->getTypeAtIndex(1)->getScalarSizeInBits() == 1 &&2856"Overflow value expected to be an i1");28572858MVT VT;2859if (!isTypeLegal(RetTy, VT))2860return false;28612862if (VT < MVT::i8 || VT > MVT::i64)2863return false;28642865const Value *LHS = II->getArgOperand(0);2866const Value *RHS = II->getArgOperand(1);28672868// Canonicalize immediate to the RHS.2869if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())2870std::swap(LHS, RHS);28712872unsigned BaseOpc, CondCode;2873switch (II->getIntrinsicID()) {2874default: llvm_unreachable("Unexpected intrinsic!");2875case Intrinsic::sadd_with_overflow:2876BaseOpc = ISD::ADD; CondCode = X86::COND_O; break;2877case Intrinsic::uadd_with_overflow:2878BaseOpc = ISD::ADD; CondCode = X86::COND_B; break;2879case Intrinsic::ssub_with_overflow:2880BaseOpc = ISD::SUB; CondCode = X86::COND_O; break;2881case Intrinsic::usub_with_overflow:2882BaseOpc = ISD::SUB; CondCode = X86::COND_B; break;2883case Intrinsic::smul_with_overflow:2884BaseOpc = X86ISD::SMUL; CondCode = X86::COND_O; break;2885case Intrinsic::umul_with_overflow:2886BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;2887}28882889Register LHSReg = getRegForValue(LHS);2890if (LHSReg == 0)2891return false;28922893unsigned ResultReg = 0;2894// Check if we have an immediate version.2895if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {2896static const uint16_t Opc[2][4] = {2897{ X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },2898{ X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }2899};29002901if (CI->isOne() && (BaseOpc == ISD::ADD || BaseOpc == ISD::SUB) &&2902CondCode == X86::COND_O) {2903// We can use INC/DEC.2904ResultReg = createResultReg(TLI.getRegClassFor(VT));2905bool IsDec = BaseOpc == ISD::SUB;2906BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2907TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)2908.addReg(LHSReg);2909} else2910ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, CI->getZExtValue());2911}29122913unsigned RHSReg;2914if (!ResultReg) {2915RHSReg = getRegForValue(RHS);2916if (RHSReg == 0)2917return 
false;2918ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, RHSReg);2919}29202921// FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit2922// it manually.2923if (BaseOpc == X86ISD::UMUL && !ResultReg) {2924static const uint16_t MULOpc[] =2925{ X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };2926static const MCPhysReg Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };2927// First copy the first operand into RAX, which is an implicit input to2928// the X86::MUL*r instruction.2929BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2930TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])2931.addReg(LHSReg);2932ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],2933TLI.getRegClassFor(VT), RHSReg);2934} else if (BaseOpc == X86ISD::SMUL && !ResultReg) {2935static const uint16_t MULOpc[] =2936{ X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };2937if (VT == MVT::i8) {2938// Copy the first operand into AL, which is an implicit input to the2939// X86::IMUL8r instruction.2940BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,2941TII.get(TargetOpcode::COPY), X86::AL)2942.addReg(LHSReg);2943ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg);2944} else2945ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],2946TLI.getRegClassFor(VT), LHSReg, RHSReg);2947}29482949if (!ResultReg)2950return false;29512952// Assign to a GPR since the overflow return value is lowered to a SETcc.2953Register ResultReg2 = createResultReg(&X86::GR8RegClass);2954assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");2955BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::SETCCr),2956ResultReg2).addImm(CondCode);29572958updateValueMap(II, ResultReg, 2);2959return true;2960}2961case Intrinsic::x86_sse_cvttss2si:2962case Intrinsic::x86_sse_cvttss2si64:2963case Intrinsic::x86_sse2_cvttsd2si:2964case Intrinsic::x86_sse2_cvttsd2si64: {2965bool IsInputDouble;2966switch (II->getIntrinsicID()) {2967default: llvm_unreachable("Unexpected intrinsic.");2968case Intrinsic::x86_sse_cvttss2si:2969case Intrinsic::x86_sse_cvttss2si64:2970if (!Subtarget->hasSSE1())2971return false;2972IsInputDouble = false;2973break;2974case Intrinsic::x86_sse2_cvttsd2si:2975case Intrinsic::x86_sse2_cvttsd2si64:2976if (!Subtarget->hasSSE2())2977return false;2978IsInputDouble = true;2979break;2980}29812982Type *RetTy = II->getCalledFunction()->getReturnType();2983MVT VT;2984if (!isTypeLegal(RetTy, VT))2985return false;29862987static const uint16_t CvtOpc[3][2][2] = {2988{ { X86::CVTTSS2SIrr, X86::CVTTSS2SI64rr },2989{ X86::CVTTSD2SIrr, X86::CVTTSD2SI64rr } },2990{ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SI64rr },2991{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SI64rr } },2992{ { X86::VCVTTSS2SIZrr, X86::VCVTTSS2SI64Zrr },2993{ X86::VCVTTSD2SIZrr, X86::VCVTTSD2SI64Zrr } },2994};2995unsigned AVXLevel = Subtarget->hasAVX512() ? 2 :2996Subtarget->hasAVX() ? 
1 :29970;2998unsigned Opc;2999switch (VT.SimpleTy) {3000default: llvm_unreachable("Unexpected result type.");3001case MVT::i32: Opc = CvtOpc[AVXLevel][IsInputDouble][0]; break;3002case MVT::i64: Opc = CvtOpc[AVXLevel][IsInputDouble][1]; break;3003}30043005// Check if we can fold insertelement instructions into the convert.3006const Value *Op = II->getArgOperand(0);3007while (auto *IE = dyn_cast<InsertElementInst>(Op)) {3008const Value *Index = IE->getOperand(2);3009if (!isa<ConstantInt>(Index))3010break;3011unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();30123013if (Idx == 0) {3014Op = IE->getOperand(1);3015break;3016}3017Op = IE->getOperand(0);3018}30193020Register Reg = getRegForValue(Op);3021if (Reg == 0)3022return false;30233024Register ResultReg = createResultReg(TLI.getRegClassFor(VT));3025BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg)3026.addReg(Reg);30273028updateValueMap(II, ResultReg);3029return true;3030}3031case Intrinsic::x86_sse42_crc32_32_8:3032case Intrinsic::x86_sse42_crc32_32_16:3033case Intrinsic::x86_sse42_crc32_32_32:3034case Intrinsic::x86_sse42_crc32_64_64: {3035if (!Subtarget->hasCRC32())3036return false;30373038Type *RetTy = II->getCalledFunction()->getReturnType();30393040MVT VT;3041if (!isTypeLegal(RetTy, VT))3042return false;30433044unsigned Opc;3045const TargetRegisterClass *RC = nullptr;30463047switch (II->getIntrinsicID()) {3048default:3049llvm_unreachable("Unexpected intrinsic.");3050#define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC3051case Intrinsic::x86_sse42_crc32_32_8:3052Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8);3053RC = &X86::GR32RegClass;3054break;3055case Intrinsic::x86_sse42_crc32_32_16:3056Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16);3057RC = &X86::GR32RegClass;3058break;3059case Intrinsic::x86_sse42_crc32_32_32:3060Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32);3061RC = &X86::GR32RegClass;3062break;3063case Intrinsic::x86_sse42_crc32_64_64:3064Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64);3065RC = &X86::GR64RegClass;3066break;3067#undef GET_EGPR_IF_ENABLED3068}30693070const Value *LHS = II->getArgOperand(0);3071const Value *RHS = II->getArgOperand(1);30723073Register LHSReg = getRegForValue(LHS);3074Register RHSReg = getRegForValue(RHS);3075if (!LHSReg || !RHSReg)3076return false;30773078Register ResultReg = fastEmitInst_rr(Opc, RC, LHSReg, RHSReg);3079if (!ResultReg)3080return false;30813082updateValueMap(II, ResultReg);3083return true;3084}3085}3086}30873088bool X86FastISel::fastLowerArguments() {3089if (!FuncInfo.CanLowerReturn)3090return false;30913092const Function *F = FuncInfo.Fn;3093if (F->isVarArg())3094return false;30953096CallingConv::ID CC = F->getCallingConv();3097if (CC != CallingConv::C)3098return false;30993100if (Subtarget->isCallingConvWin64(CC))3101return false;31023103if (!Subtarget->is64Bit())3104return false;31053106if (Subtarget->useSoftFloat())3107return false;31083109// Only handle simple cases. i.e. 
Up to 6 i32/i64 scalar arguments.3110unsigned GPRCnt = 0;3111unsigned FPRCnt = 0;3112for (auto const &Arg : F->args()) {3113if (Arg.hasAttribute(Attribute::ByVal) ||3114Arg.hasAttribute(Attribute::InReg) ||3115Arg.hasAttribute(Attribute::StructRet) ||3116Arg.hasAttribute(Attribute::SwiftSelf) ||3117Arg.hasAttribute(Attribute::SwiftAsync) ||3118Arg.hasAttribute(Attribute::SwiftError) ||3119Arg.hasAttribute(Attribute::Nest))3120return false;31213122Type *ArgTy = Arg.getType();3123if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())3124return false;31253126EVT ArgVT = TLI.getValueType(DL, ArgTy);3127if (!ArgVT.isSimple()) return false;3128switch (ArgVT.getSimpleVT().SimpleTy) {3129default: return false;3130case MVT::i32:3131case MVT::i64:3132++GPRCnt;3133break;3134case MVT::f32:3135case MVT::f64:3136if (!Subtarget->hasSSE1())3137return false;3138++FPRCnt;3139break;3140}31413142if (GPRCnt > 6)3143return false;31443145if (FPRCnt > 8)3146return false;3147}31483149static const MCPhysReg GPR32ArgRegs[] = {3150X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D3151};3152static const MCPhysReg GPR64ArgRegs[] = {3153X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R93154};3155static const MCPhysReg XMMArgRegs[] = {3156X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,3157X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM73158};31593160unsigned GPRIdx = 0;3161unsigned FPRIdx = 0;3162for (auto const &Arg : F->args()) {3163MVT VT = TLI.getSimpleValueType(DL, Arg.getType());3164const TargetRegisterClass *RC = TLI.getRegClassFor(VT);3165unsigned SrcReg;3166switch (VT.SimpleTy) {3167default: llvm_unreachable("Unexpected value type.");3168case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;3169case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;3170case MVT::f32: [[fallthrough]];3171case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;3172}3173Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);3174// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.3175// Without this, EmitLiveInCopies may eliminate the livein if its only3176// use is a bitcast (which isn't turned into an instruction).3177Register ResultReg = createResultReg(RC);3178BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,3179TII.get(TargetOpcode::COPY), ResultReg)3180.addReg(DstReg, getKillRegState(true));3181updateValueMap(&Arg, ResultReg);3182}3183return true;3184}31853186static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,3187CallingConv::ID CC,3188const CallBase *CB) {3189if (Subtarget->is64Bit())3190return 0;3191if (Subtarget->getTargetTriple().isOSMSVCRT())3192return 0;3193if (CC == CallingConv::Fast || CC == CallingConv::GHC ||3194CC == CallingConv::HiPE || CC == CallingConv::Tail ||3195CC == CallingConv::SwiftTail)3196return 0;31973198if (CB)3199if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||3200CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())3201return 0;32023203return 4;3204}32053206bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {3207auto &OutVals = CLI.OutVals;3208auto &OutFlags = CLI.OutFlags;3209auto &OutRegs = CLI.OutRegs;3210auto &Ins = CLI.Ins;3211auto &InRegs = CLI.InRegs;3212CallingConv::ID CC = CLI.CallConv;3213bool &IsTailCall = CLI.IsTailCall;3214bool IsVarArg = CLI.IsVarArg;3215const Value *Callee = CLI.Callee;3216MCSymbol *Symbol = CLI.Symbol;3217const auto *CB = CLI.CB;32183219bool Is64Bit = Subtarget->is64Bit();3220bool IsWin64 = Subtarget->isCallingConvWin64(CC);32213222// Call / invoke instructions with 
NoCfCheck attribute require special
  // handling.
  if (CB && CB->doesNoCfCheck())
    return false;

  // Functions with no_caller_saved_registers need special handling.
  if ((CB && isa<CallInst>(CB) && CB->hasFnAttr("no_caller_saved_registers")))
    return false;

  // Functions with no_callee_saved_registers need special handling.
  if ((CB && CB->hasFnAttr("no_callee_saved_registers")))
    return false;

  // Indirect calls with CFI checks need special handling.
  if (CB && CB->isIndirectCall() && CB->getOperandBundle(LLVMContext::OB_kcfi))
    return false;

  // Functions using thunks for indirect calls need to use SDISel.
  if (Subtarget->useIndirectThunkCalls())
    return false;

  // Handle only C and fastcc calling conventions for now.
  switch (CC) {
  default: return false;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Tail:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  case CallingConv::CFGuard_Check:
    break;
  }

  // Allow SelectionDAG isel to handle tail calls.
  if (IsTailCall)
    return false;

  // fastcc with -tailcallopt is intended to provide a guaranteed
  // tail call optimization. FastISel doesn't know how to do that.
  if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
      CC == CallingConv::Tail || CC == CallingConv::SwiftTail)
    return false;

  // Don't know how to handle Win64 varargs yet. Nothing special is needed for
  // x86-32; special handling for x86-64 is implemented.
  if (IsVarArg && IsWin64)
    return false;

  // Don't know about inalloca yet.
  if (CLI.CB && CLI.CB->hasInAllocaArgument())
    return false;

  for (auto Flag : CLI.OutFlags)
    if (Flag.isSwiftError() || Flag.isPreallocated())
      return false;

  SmallVector<MVT, 16> OutVTs;
  SmallVector<unsigned, 16> ArgRegs;

  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
  // instruction.
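  // For example (illustrative only), for a call such as
  //   call void @f(i8 signext -1)
  // the i8 constant is widened here to the i32 immediate -1, so no separate
  // extend has to be emitted before the argument is stored or copied.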
This is safe because it is common to all FastISel supported3288// calling conventions on x86.3289for (int i = 0, e = OutVals.size(); i != e; ++i) {3290Value *&Val = OutVals[i];3291ISD::ArgFlagsTy Flags = OutFlags[i];3292if (auto *CI = dyn_cast<ConstantInt>(Val)) {3293if (CI->getBitWidth() < 32) {3294if (Flags.isSExt())3295Val = ConstantInt::get(CI->getContext(), CI->getValue().sext(32));3296else3297Val = ConstantInt::get(CI->getContext(), CI->getValue().zext(32));3298}3299}33003301// Passing bools around ends up doing a trunc to i1 and passing it.3302// Codegen this as an argument + "and 1".3303MVT VT;3304auto *TI = dyn_cast<TruncInst>(Val);3305unsigned ResultReg;3306if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&3307(TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {3308Value *PrevVal = TI->getOperand(0);3309ResultReg = getRegForValue(PrevVal);33103311if (!ResultReg)3312return false;33133314if (!isTypeLegal(PrevVal->getType(), VT))3315return false;33163317ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, 1);3318} else {3319if (!isTypeLegal(Val->getType(), VT) ||3320(VT.isVector() && VT.getVectorElementType() == MVT::i1))3321return false;3322ResultReg = getRegForValue(Val);3323}33243325if (!ResultReg)3326return false;33273328ArgRegs.push_back(ResultReg);3329OutVTs.push_back(VT);3330}33313332// Analyze operands of the call, assigning locations to each operand.3333SmallVector<CCValAssign, 16> ArgLocs;3334CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());33353336// Allocate shadow area for Win643337if (IsWin64)3338CCInfo.AllocateStack(32, Align(8));33393340CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);33413342// Get a count of how many bytes are to be pushed on the stack.3343unsigned NumBytes = CCInfo.getAlignedCallFrameSize();33443345// Issue CALLSEQ_START3346unsigned AdjStackDown = TII.getCallFrameSetupOpcode();3347BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackDown))3348.addImm(NumBytes).addImm(0).addImm(0);33493350// Walk the register/memloc assignments, inserting copies/loads.3351const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();3352for (const CCValAssign &VA : ArgLocs) {3353const Value *ArgVal = OutVals[VA.getValNo()];3354MVT ArgVT = OutVTs[VA.getValNo()];33553356if (ArgVT == MVT::x86mmx)3357return false;33583359unsigned ArgReg = ArgRegs[VA.getValNo()];33603361// Promote the value if needed.3362switch (VA.getLocInfo()) {3363case CCValAssign::Full: break;3364case CCValAssign::SExt: {3365assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&3366"Unexpected extend");33673368if (ArgVT == MVT::i1)3369return false;33703371bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,3372ArgVT, ArgReg);3373assert(Emitted && "Failed to emit a sext!"); (void)Emitted;3374ArgVT = VA.getLocVT();3375break;3376}3377case CCValAssign::ZExt: {3378assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&3379"Unexpected extend");33803381// Handle zero-extension from i1 to i8, which is common.3382if (ArgVT == MVT::i1) {3383// Set the high bits to zero.3384ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg);3385ArgVT = MVT::i8;33863387if (ArgReg == 0)3388return false;3389}33903391bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,3392ArgVT, ArgReg);3393assert(Emitted && "Failed to emit a zext!"); (void)Emitted;3394ArgVT = VA.getLocVT();3395break;3396}3397case CCValAssign::AExt: {3398assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&3399"Unexpected extend");3400bool 
Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,3401ArgVT, ArgReg);3402if (!Emitted)3403Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,3404ArgVT, ArgReg);3405if (!Emitted)3406Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,3407ArgVT, ArgReg);34083409assert(Emitted && "Failed to emit a aext!"); (void)Emitted;3410ArgVT = VA.getLocVT();3411break;3412}3413case CCValAssign::BCvt: {3414ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg);3415assert(ArgReg && "Failed to emit a bitcast!");3416ArgVT = VA.getLocVT();3417break;3418}3419case CCValAssign::VExt:3420// VExt has not been implemented, so this should be impossible to reach3421// for now. However, fallback to Selection DAG isel once implemented.3422return false;3423case CCValAssign::AExtUpper:3424case CCValAssign::SExtUpper:3425case CCValAssign::ZExtUpper:3426case CCValAssign::FPExt:3427case CCValAssign::Trunc:3428llvm_unreachable("Unexpected loc info!");3429case CCValAssign::Indirect:3430// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully3431// support this.3432return false;3433}34343435if (VA.isRegLoc()) {3436BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,3437TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);3438OutRegs.push_back(VA.getLocReg());3439} else {3440assert(VA.isMemLoc() && "Unknown value location!");34413442// Don't emit stores for undef values.3443if (isa<UndefValue>(ArgVal))3444continue;34453446unsigned LocMemOffset = VA.getLocMemOffset();3447X86AddressMode AM;3448AM.Base.Reg = RegInfo->getStackRegister();3449AM.Disp = LocMemOffset;3450ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];3451Align Alignment = DL.getABITypeAlign(ArgVal->getType());3452MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(3453MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),3454MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);3455if (Flags.isByVal()) {3456X86AddressMode SrcAM;3457SrcAM.Base.Reg = ArgReg;3458if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))3459return false;3460} else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {3461// If this is a really simple value, emit this with the Value* version3462// of X86FastEmitStore. If it isn't simple, we don't want to do this,3463// as it can cause us to reevaluate the argument.3464if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))3465return false;3466} else {3467if (!X86FastEmitStore(ArgVT, ArgReg, AM, MMO))3468return false;3469}3470}3471}34723473// ELF / PIC requires GOT in the EBX register before function calls via PLT3474// GOT pointer.3475if (Subtarget->isPICStyleGOT()) {3476unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);3477BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,3478TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);3479}34803481if (Is64Bit && IsVarArg && !IsWin64) {3482// From AMD64 ABI document:3483// For calls that may call functions that use varargs or stdargs3484// (prototype-less calls or calls to functions containing ellipsis (...) in3485// the declaration) %al is used as hidden argument to specify the number3486// of SSE registers used. 
    // The contents of %al do not need to match exactly the number of
    // registers, but must be an upper bound on the number of SSE registers
    // used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV8ri),
            X86::AL).addImm(NumXMMRegs);
  }

  // Materialize callee address in a register. FIXME: GV address can be
  // handled with a CALLpcrel32 instead.
  X86AddressMode CalleeAM;
  if (!X86SelectCallAddress(Callee, CalleeAM))
    return false;

  unsigned CalleeOp = 0;
  const GlobalValue *GV = nullptr;
  if (CalleeAM.GV != nullptr) {
    GV = CalleeAM.GV;
  } else if (CalleeAM.Base.Reg != 0) {
    CalleeOp = CalleeAM.Base.Reg;
  } else
    return false;

  // Issue the call.
  MachineInstrBuilder MIB;
  if (CalleeOp) {
    // Register-indirect call.
    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
              .addReg(CalleeOp);
  } else {
    // Direct call.
    assert(GV && "Not a direct call");
    // See if we need any target-specific flags on the GV operand.
    unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
    if (OpFlags == X86II::MO_PLT && !Is64Bit &&
        TM.getRelocationModel() == Reloc::Static && isa<Function>(GV) &&
        cast<Function>(GV)->isIntrinsic())
      OpFlags = X86II::MO_NO_FLAG;

    // This will be a direct call, or an indirect call through memory for
    // NonLazyBind calls or dllimport calls.
    bool NeedLoad = OpFlags == X86II::MO_DLLIMPORT ||
                    OpFlags == X86II::MO_GOTPCREL ||
                    OpFlags == X86II::MO_GOTPCREL_NORELAX ||
                    OpFlags == X86II::MO_COFFSTUB;
    unsigned CallOpc = NeedLoad
                           ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
                           : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);

    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc));
    if (NeedLoad)
      MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0);
    if (Symbol)
      MIB.addSym(Symbol, OpFlags);
    else
      MIB.addGlobalAddress(GV, 0, OpFlags);
    if (NeedLoad)
      MIB.addReg(0);
  }

  // Add a register mask operand representing the call-preserved registers.
  // Proper defs for return values will be added by setPhysRegsDeadExcept().
  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));

  // Add an implicit use of the GOT pointer in EBX.
  if (Subtarget->isPICStyleGOT())
    MIB.addReg(X86::EBX, RegState::Implicit);

  if (Is64Bit && IsVarArg && !IsWin64)
    MIB.addReg(X86::AL, RegState::Implicit);

  // Add implicit physical register uses to the call.
  for (auto Reg : OutRegs)
    MIB.addReg(Reg, RegState::Implicit);

  // Issue CALLSEQ_END
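  // Callee-pop conventions pop the whole argument area; otherwise the callee
  // pops at most the hidden sret pointer.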
  unsigned NumBytesForCalleeToPop =
      X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
                       TM.Options.GuaranteedTailCallOpt)
          ? NumBytes // Callee pops everything.
          : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackUp))
      .addImm(NumBytes).addImm(NumBytesForCalleeToPop);

  // Now handle call return values.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
                    CLI.RetTy->getContext());
  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();
    unsigned CopyReg = ResultReg + i;
    Register SrcReg = VA.getLocReg();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
      CopyReg = createResultReg(&X86::RFP80RegClass);
    }

    // Copy out the result.
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
    InRegs.push_back(VA.getLocReg());

    // Round the f80 to the right size, which also moves it to the appropriate
    // xmm register. This is accomplished by storing the f80 value in memory
    // and then loading it back.
    if (CopyVT != VA.getValVT()) {
      EVT ResVT = VA.getValVT();
      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
      unsigned MemSize = ResVT.getSizeInBits()/8;
      int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                                TII.get(Opc)), FI)
          .addReg(CopyReg);
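      // Reload the spilled value as an f32/f64 scalar of the final size; the
      // load places it in an XMM register.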
      Opc = ResVT == MVT::f32 ? X86::MOVSSrm_alt : X86::MOVSDrm_alt;
      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                                TII.get(Opc), ResultReg + i), FI);
    }
  }

  CLI.ResultReg = ResultReg;
  CLI.NumResultRegs = RVLocs.size();
  CLI.Call = MIB;

  return true;
}

bool
X86FastISel::fastSelectInstruction(const Instruction *I) {
  switch (I->getOpcode()) {
  default: break;
  case Instruction::Load:
    return X86SelectLoad(I);
  case Instruction::Store:
    return X86SelectStore(I);
  case Instruction::Ret:
    return X86SelectRet(I);
  case Instruction::ICmp:
  case Instruction::FCmp:
    return X86SelectCmp(I);
  case Instruction::ZExt:
    return X86SelectZExt(I);
  case Instruction::SExt:
    return X86SelectSExt(I);
  case Instruction::Br:
    return X86SelectBranch(I);
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::Shl:
    return X86SelectShift(I);
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return X86SelectDivRem(I);
  case Instruction::Select:
    return X86SelectSelect(I);
  case Instruction::Trunc:
    return X86SelectTrunc(I);
  case Instruction::FPExt:
    return X86SelectFPExt(I);
  case Instruction::FPTrunc:
    return X86SelectFPTrunc(I);
  case Instruction::SIToFP:
    return X86SelectSIToFP(I);
  case Instruction::UIToFP:
    return X86SelectUIToFP(I);
  case Instruction::IntToPtr: // Deliberate fall-through.
  case Instruction::PtrToInt: {
    EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
    EVT DstVT = TLI.getValueType(DL, I->getType());
    if (DstVT.bitsGT(SrcVT))
      return X86SelectZExt(I);
    if (DstVT.bitsLT(SrcVT))
      return X86SelectTrunc(I);
    Register Reg = getRegForValue(I->getOperand(0));
    if (Reg == 0) return false;
    updateValueMap(I, Reg);
    return true;
  }
  case Instruction::BitCast: {
    // Select SSE2/AVX bitcasts between 128/256/512 bit vector types.
    if (!Subtarget->hasSSE2())
      return false;

    MVT SrcVT, DstVT;
    if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT) ||
        !isTypeLegal(I->getType(), DstVT))
      return false;

    // Only allow vectors that use xmm/ymm/zmm.
    if (!SrcVT.isVector() || !DstVT.isVector() ||
        SrcVT.getVectorElementType() == MVT::i1 ||
        DstVT.getVectorElementType() == MVT::i1)
      return false;

    Register Reg = getRegForValue(I->getOperand(0));
    if (!Reg)
      return false;

    // Emit a reg-reg copy so we don't propagate cached known bits information
    // with the wrong VT if we fall out of fast isel after selecting this.
    const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
    Register ResultReg = createResultReg(DstClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
            TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);

    updateValueMap(I, ResultReg);
    return true;
  }
  }

  return false;
}

unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
  if (VT > MVT::i64)
    return 0;

  uint64_t Imm = CI->getZExtValue();
  if (Imm == 0) {
    Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
    switch (VT.SimpleTy) {
    default: llvm_unreachable("Unexpected value type");
    case MVT::i1:
    case MVT::i8:
      return fastEmitInst_extractsubreg(MVT::i8, SrcReg, X86::sub_8bit);
    case MVT::i16:
      return fastEmitInst_extractsubreg(MVT::i16, SrcReg, X86::sub_16bit);
    case MVT::i32:
      return SrcReg;
    case MVT::i64: {
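      // The MOV32r0 already cleared the upper 32 bits (32-bit register writes
      // zero-extend on x86-64), so widening to i64 only needs SUBREG_TO_REG.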
      Register ResultReg = createResultReg(&X86::GR64RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
          .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
      return ResultReg;
    }
    }
  }

  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected value type");
  case MVT::i1:
    VT = MVT::i8;
    [[fallthrough]];
  case MVT::i8: Opc = X86::MOV8ri; break;
  case MVT::i16: Opc = X86::MOV16ri; break;
  case MVT::i32: Opc = X86::MOV32ri; break;
  case MVT::i64: {
    if (isUInt<32>(Imm))
      Opc = X86::MOV32ri64;
    else if (isInt<32>(Imm))
      Opc = X86::MOV64ri32;
    else
      Opc = X86::MOV64ri;
    break;
  }
  }
  return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
}

unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
  if (CFP->isNullValue())
    return fastMaterializeFloatZero(CFP);

  // Can't handle alternate code models yet.
  CodeModel::Model CM = TM.getCodeModel();
  if (CM != CodeModel::Small && CM != CodeModel::Medium &&
      CM != CodeModel::Large)
    return 0;

  // Get opcode and regclass of the output for the given load instruction.
  unsigned Opc = 0;
  bool HasSSE1 = Subtarget->hasSSE1();
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasAVX = Subtarget->hasAVX();
  bool HasAVX512 = Subtarget->hasAVX512();
  switch (VT.SimpleTy) {
  default: return 0;
  case MVT::f32:
    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
          : HasAVX  ? X86::VMOVSSrm_alt
          : HasSSE1 ? X86::MOVSSrm_alt
                    : X86::LD_Fp32m;
    break;
  case MVT::f64:
    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
          : HasAVX  ? X86::VMOVSDrm_alt
          : HasSSE2 ? X86::MOVSDrm_alt
                    : X86::LD_Fp64m;
    break;
  case MVT::f80:
    // No f80 support yet.
    return 0;
  }

  // MachineConstantPool wants an explicit alignment.
  Align Alignment = DL.getPrefTypeAlign(CFP->getType());

  // x86-32 PIC requires a PIC base register for constant pools.
  unsigned PICBase = 0;
  unsigned char OpFlag = Subtarget->classifyLocalReference(nullptr);
  if (OpFlag == X86II::MO_PIC_BASE_OFFSET)
    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
  else if (OpFlag == X86II::MO_GOTOFF)
    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
  else if (Subtarget->is64Bit() && TM.getCodeModel() != CodeModel::Large)
    PICBase = X86::RIP;

  // Create the load from the constant pool.
  unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
  Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));

  // Large code model only applies to 64-bit mode.
  if (Subtarget->is64Bit() && CM == CodeModel::Large) {
    Register AddrReg = createResultReg(&X86::GR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
            AddrReg)
        .addConstantPoolIndex(CPI, 0, OpFlag);
    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                                      TII.get(Opc), ResultReg);
    addRegReg(MIB, AddrReg, false, PICBase, false);
    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
        MachinePointerInfo::getConstantPool(*FuncInfo.MF),
        MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
    MIB->addMemOperand(*FuncInfo.MF, MMO);
    return ResultReg;
  }

  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                                   TII.get(Opc), ResultReg),
                           CPI, PICBase, OpFlag);
  return ResultReg;
}

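/// Materialize the address of a global value into a register: return the base
/// register directly when the address is a plain register, otherwise compute
/// it with an LEA (or a 64-bit immediate MOV under static relocation).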
unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
  // Can't handle large GlobalValues yet.
  if (TM.getCodeModel() != CodeModel::Small &&
      TM.getCodeModel() != CodeModel::Medium)
    return 0;
  if (TM.isLargeGlobalValue(GV))
    return 0;

  // Materialize addresses with LEA/MOV instructions.
  X86AddressMode AM;
  if (X86SelectAddress(GV, AM)) {
    // If the expression is just a basereg, then we're done, otherwise we need
    // to emit an LEA.
    if (AM.BaseType == X86AddressMode::RegBase &&
        AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
      return AM.Base.Reg;

    Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
    if (TM.getRelocationModel() == Reloc::Static &&
        TLI.getPointerTy(DL) == MVT::i64) {
      // The displacement could be more than 32 bits away, so we need to use
      // an instruction with a 64-bit immediate.
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(X86::MOV64ri),
              ResultReg)
          .addGlobalAddress(GV);
    } else {
      unsigned Opc =
          TLI.getPointerTy(DL) == MVT::i32
              ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
              : X86::LEA64r;
      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                             TII.get(Opc), ResultReg), AM);
    }
    return ResultReg;
  }
  return 0;
}

unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
  EVT CEVT = TLI.getValueType(DL, C->getType(), true);

  // Only handle simple types.
  if (!CEVT.isSimple())
    return 0;
  MVT VT = CEVT.getSimpleVT();

  if (const auto *CI = dyn_cast<ConstantInt>(C))
    return X86MaterializeInt(CI, VT);
  if (const auto *CFP = dyn_cast<ConstantFP>(C))
    return X86MaterializeFP(CFP, VT);
  if (const auto *GV = dyn_cast<GlobalValue>(C))
    return X86MaterializeGV(GV, VT);
  if (isa<UndefValue>(C)) {
    unsigned Opc = 0;
    switch (VT.SimpleTy) {
    default:
      break;
    case MVT::f32:
      if (!Subtarget->hasSSE1())
        Opc = X86::LD_Fp032;
      break;
    case MVT::f64:
      if (!Subtarget->hasSSE2())
        Opc = X86::LD_Fp064;
      break;
    case MVT::f80:
      Opc = X86::LD_Fp080;
      break;
    }

    if (Opc) {
      Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc),
              ResultReg);
      return ResultReg;
    }
  }

  return 0;
}

unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
  // Fail on dynamic allocas. At this point, getRegForValue has already
  // checked its CSE maps, so if we're here trying to handle a dynamic
  // alloca, we're not going to succeed. X86SelectAddress has a
  // check for dynamic allocas, because it's called directly from
  // various places, but targetMaterializeAlloca also needs a check
  // in order to avoid recursion between getRegForValue,
  // X86SelectAddress, and targetMaterializeAlloca.
  if (!FuncInfo.StaticAllocaMap.count(C))
    return 0;
  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");

  X86AddressMode AM;
  if (!X86SelectAddress(C, AM))
    return 0;
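  // Compute the alloca's address with an LEA whose width matches the pointer
  // type; ILP32 (x32) targets use LEA64_32r to get a 32-bit result from a
  // 64-bit address computation.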
  unsigned Opc =
      TLI.getPointerTy(DL) == MVT::i32
          ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
          : X86::LEA64r;
  const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
  Register ResultReg = createResultReg(RC);
  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
                         TII.get(Opc), ResultReg), AM);
  return ResultReg;
}

unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
  MVT VT;
  if (!isTypeLegal(CF->getType(), VT))
    return 0;

  // Get opcode and regclass for the given zero.
  bool HasSSE1 = Subtarget->hasSSE1();
  bool HasSSE2 = Subtarget->hasSSE2();
  bool HasAVX512 = Subtarget->hasAVX512();
  unsigned Opc = 0;
  switch (VT.SimpleTy) {
  default: return 0;
  case MVT::f16:
    Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
    break;
  case MVT::f32:
    Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
          : HasSSE1 ? X86::FsFLD0SS
                    : X86::LD_Fp032;
    break;
  case MVT::f64:
    Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
          : HasSSE2 ? X86::FsFLD0SD
                    : X86::LD_Fp064;
    break;
  case MVT::f80:
    // No f80 support yet.
    return 0;
  }

  Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg);
  return ResultReg;
}

bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                                      const LoadInst *LI) {
  const Value *Ptr = LI->getPointerOperand();
  X86AddressMode AM;
  if (!X86SelectAddress(Ptr, AM))
    return false;

  const X86InstrInfo &XII = (const X86InstrInfo &)TII;

  unsigned Size = DL.getTypeAllocSize(LI->getType());

  SmallVector<MachineOperand, 8> AddrOps;
  AM.getFullAddress(AddrOps);

  MachineInstr *Result = XII.foldMemoryOperandImpl(
      *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
      /*AllowCommute=*/true);
  if (!Result)
    return false;

  // The index register could be in the wrong register class. Unfortunately,
  // foldMemoryOperandImpl could have commuted the instruction, so it's not
  // enough to just look at OpNo + the offset to the index reg.
  // We actually need to scan the instruction to find the index reg and see
  // if it's the correct reg class.
  unsigned OperandNo = 0;
  for (MachineInstr::mop_iterator I = Result->operands_begin(),
       E = Result->operands_end(); I != E; ++I, ++OperandNo) {
    MachineOperand &MO = *I;
    if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
      continue;
    // Found the index reg, now try to rewrite it.
    Register IndexReg = constrainOperandRegClass(Result->getDesc(),
                                                 MO.getReg(), OperandNo);
    if (IndexReg == MO.getReg())
      continue;
    MO.setReg(IndexReg);
  }

  Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
  Result->cloneInstrSymbols(*FuncInfo.MF, *MI);
  MachineBasicBlock::iterator I(MI);
  removeDeadCode(I, std::next(I));
  return true;
}

/// Emit an instruction that takes four register operands, constraining each
/// operand to the register class required by the opcode.
unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
                                        const TargetRegisterClass *RC,
                                        unsigned Op0, unsigned Op1,
                                        unsigned Op2, unsigned Op3) {
  const MCInstrDesc &II = TII.get(MachineInstOpcode);

  Register ResultReg = createResultReg(RC);
  Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
  Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
  Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
  Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);

  if (II.getNumDefs() >= 1)
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
        .addReg(Op0)
        .addReg(Op1)
        .addReg(Op2)
        .addReg(Op3);
  else {
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
        .addReg(Op0)
        .addReg(Op1)
        .addReg(Op2)
        .addReg(Op3);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
            ResultReg)
        .addReg(II.implicit_defs()[0]);
  }
  return ResultReg;
}

namespace llvm {
  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
                                const TargetLibraryInfo *libInfo) {
    return new X86FastISel(funcInfo, libInfo);
  }
}