Path: blob/main/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
104186 views
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file implements the PPCISelLowering class.9//10//===----------------------------------------------------------------------===//1112#include "PPCISelLowering.h"13#include "MCTargetDesc/PPCMCTargetDesc.h"14#include "MCTargetDesc/PPCPredicates.h"15#include "PPC.h"16#include "PPCCCState.h"17#include "PPCCallingConv.h"18#include "PPCFrameLowering.h"19#include "PPCInstrInfo.h"20#include "PPCMachineFunctionInfo.h"21#include "PPCPerfectShuffle.h"22#include "PPCRegisterInfo.h"23#include "PPCSubtarget.h"24#include "PPCTargetMachine.h"25#include "llvm/ADT/APFloat.h"26#include "llvm/ADT/APInt.h"27#include "llvm/ADT/APSInt.h"28#include "llvm/ADT/ArrayRef.h"29#include "llvm/ADT/DenseMap.h"30#include "llvm/ADT/STLExtras.h"31#include "llvm/ADT/SmallPtrSet.h"32#include "llvm/ADT/SmallSet.h"33#include "llvm/ADT/SmallVector.h"34#include "llvm/ADT/Statistic.h"35#include "llvm/ADT/StringRef.h"36#include "llvm/ADT/StringSwitch.h"37#include "llvm/CodeGen/CallingConvLower.h"38#include "llvm/CodeGen/ISDOpcodes.h"39#include "llvm/CodeGen/MachineBasicBlock.h"40#include "llvm/CodeGen/MachineFrameInfo.h"41#include "llvm/CodeGen/MachineFunction.h"42#include "llvm/CodeGen/MachineInstr.h"43#include "llvm/CodeGen/MachineInstrBuilder.h"44#include "llvm/CodeGen/MachineJumpTableInfo.h"45#include "llvm/CodeGen/MachineLoopInfo.h"46#include "llvm/CodeGen/MachineMemOperand.h"47#include "llvm/CodeGen/MachineModuleInfo.h"48#include "llvm/CodeGen/MachineOperand.h"49#include "llvm/CodeGen/MachineRegisterInfo.h"50#include "llvm/CodeGen/RuntimeLibcallUtil.h"51#include "llvm/CodeGen/SelectionDAG.h"52#include "llvm/CodeGen/SelectionDAGNodes.h"53#include "llvm/CodeGen/TargetInstrInfo.h"54#include "llvm/CodeGen/TargetLowering.h"55#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"56#include "llvm/CodeGen/TargetRegisterInfo.h"57#include "llvm/CodeGen/ValueTypes.h"58#include "llvm/CodeGenTypes/MachineValueType.h"59#include "llvm/IR/CallingConv.h"60#include "llvm/IR/Constant.h"61#include "llvm/IR/Constants.h"62#include "llvm/IR/DataLayout.h"63#include "llvm/IR/DebugLoc.h"64#include "llvm/IR/DerivedTypes.h"65#include "llvm/IR/Function.h"66#include "llvm/IR/GlobalValue.h"67#include "llvm/IR/IRBuilder.h"68#include "llvm/IR/Instructions.h"69#include "llvm/IR/Intrinsics.h"70#include "llvm/IR/IntrinsicsPowerPC.h"71#include "llvm/IR/Module.h"72#include "llvm/IR/Type.h"73#include "llvm/IR/Use.h"74#include "llvm/IR/Value.h"75#include "llvm/MC/MCContext.h"76#include "llvm/MC/MCExpr.h"77#include "llvm/MC/MCRegisterInfo.h"78#include "llvm/MC/MCSectionXCOFF.h"79#include "llvm/MC/MCSymbolXCOFF.h"80#include "llvm/Support/AtomicOrdering.h"81#include "llvm/Support/BranchProbability.h"82#include "llvm/Support/Casting.h"83#include "llvm/Support/CodeGen.h"84#include "llvm/Support/CommandLine.h"85#include "llvm/Support/Compiler.h"86#include "llvm/Support/Debug.h"87#include "llvm/Support/ErrorHandling.h"88#include "llvm/Support/Format.h"89#include "llvm/Support/KnownBits.h"90#include "llvm/Support/MathExtras.h"91#include "llvm/Support/raw_ostream.h"92#include "llvm/Target/TargetMachine.h"93#include "llvm/Target/TargetOptions.h"94#include <algorithm>95#include <cassert>96#include 
<cstdint>97#include <iterator>98#include <list>99#include <optional>100#include <utility>101#include <vector>102103using namespace llvm;104105#define DEBUG_TYPE "ppc-lowering"106107static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",108cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);109110static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",111cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);112113static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",114cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);115116static cl::opt<bool> DisableSCO("disable-ppc-sco",117cl::desc("disable sibling call optimization on ppc"), cl::Hidden);118119static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",120cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);121122static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",123cl::desc("use absolute jump tables on ppc"), cl::Hidden);124125static cl::opt<bool>126DisablePerfectShuffle("ppc-disable-perfect-shuffle",127cl::desc("disable vector permute decomposition"),128cl::init(true), cl::Hidden);129130cl::opt<bool> DisableAutoPairedVecSt(131"disable-auto-paired-vec-st",132cl::desc("disable automatically generated 32byte paired vector stores"),133cl::init(true), cl::Hidden);134135static cl::opt<unsigned> PPCMinimumJumpTableEntries(136"ppc-min-jump-table-entries", cl::init(64), cl::Hidden,137cl::desc("Set minimum number of entries to use a jump table on PPC"));138139static cl::opt<unsigned> PPCGatherAllAliasesMaxDepth(140"ppc-gather-alias-max-depth", cl::init(18), cl::Hidden,141cl::desc("max depth when checking alias info in GatherAllAliases()"));142143static cl::opt<unsigned> PPCAIXTLSModelOptUseIEForLDLimit(144"ppc-aix-shared-lib-tls-model-opt-limit", cl::init(1), cl::Hidden,145cl::desc("Set inclusive limit count of TLS local-dynamic access(es) in a "146"function to use initial-exec"));147148STATISTIC(NumTailCalls, "Number of tail calls");149STATISTIC(NumSiblingCalls, "Number of sibling calls");150STATISTIC(ShufflesHandledWithVPERM,151"Number of shuffles lowered to a VPERM or XXPERM");152STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");153154static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);155156static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);157158static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";159160// A faster local-[exec|dynamic] TLS access sequence (enabled with the161// -maix-small-local-[exec|dynamic]-tls option) can be produced for TLS162// variables; consistent with the IBM XL compiler, we apply a max size of163// slightly under 32KB.164constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;165166// FIXME: Remove this once the bug has been fixed!167extern cl::opt<bool> ANDIGlueBug;168169PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,170const PPCSubtarget &STI)171: TargetLowering(TM), Subtarget(STI) {172// Initialize map that relates the PPC addressing modes to the computed flags173// of a load/store instruction. The map is used to determine the optimal174// addressing mode when selecting load and stores.175initializeAddrModeMap();176// On PPC32/64, arguments smaller than 4/8 bytes are extended, so all177// arguments are at least 4/8 bytes aligned.178bool isPPC64 = Subtarget.isPPC64();179setMinStackArgumentAlignment(isPPC64 ? 
Align(8) : Align(4));180181// Set up the register classes.182addRegisterClass(MVT::i32, &PPC::GPRCRegClass);183if (!useSoftFloat()) {184if (hasSPE()) {185addRegisterClass(MVT::f32, &PPC::GPRCRegClass);186// EFPU2 APU only supports f32187if (!Subtarget.hasEFPU2())188addRegisterClass(MVT::f64, &PPC::SPERCRegClass);189} else {190addRegisterClass(MVT::f32, &PPC::F4RCRegClass);191addRegisterClass(MVT::f64, &PPC::F8RCRegClass);192}193}194195// Match BITREVERSE to customized fast code sequence in the td file.196setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);197setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);198199// Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.200setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);201202// Custom lower inline assembly to check for special registers.203setOperationAction(ISD::INLINEASM, MVT::Other, Custom);204setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom);205206// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.207for (MVT VT : MVT::integer_valuetypes()) {208setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);209setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);210}211212if (Subtarget.isISA3_0()) {213setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);214setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);215setTruncStoreAction(MVT::f64, MVT::f16, Legal);216setTruncStoreAction(MVT::f32, MVT::f16, Legal);217} else {218// No extending loads from f16 or HW conversions back and forth.219setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);220setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);221setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);222setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);223setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);224setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);225setTruncStoreAction(MVT::f64, MVT::f16, Expand);226setTruncStoreAction(MVT::f32, MVT::f16, Expand);227}228229setTruncStoreAction(MVT::f64, MVT::f32, Expand);230231// PowerPC has pre-inc load and store's.232setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);233setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);234setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);235setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);236setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);237setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);238setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);239setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);240setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);241setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);242if (!Subtarget.hasSPE()) {243setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);244setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);245setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);246setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);247}248249// PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.250const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };251for (MVT VT : ScalarIntVTs) {252setOperationAction(ISD::ADDC, VT, Legal);253setOperationAction(ISD::ADDE, VT, Legal);254setOperationAction(ISD::SUBC, VT, Legal);255setOperationAction(ISD::SUBE, VT, Legal);256}257258if (Subtarget.useCRBits()) {259setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);260261if (isPPC64 || Subtarget.hasFPCVT()) {262setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote);263AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1,264isPPC64 ? 
MVT::i64 : MVT::i32);265setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote);266AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1,267isPPC64 ? MVT::i64 : MVT::i32);268269setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);270AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,271isPPC64 ? MVT::i64 : MVT::i32);272setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);273AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,274isPPC64 ? MVT::i64 : MVT::i32);275276setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote);277AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1,278isPPC64 ? MVT::i64 : MVT::i32);279setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote);280AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1,281isPPC64 ? MVT::i64 : MVT::i32);282283setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);284AddPromotedToType(ISD::FP_TO_SINT, MVT::i1,285isPPC64 ? MVT::i64 : MVT::i32);286setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);287AddPromotedToType(ISD::FP_TO_UINT, MVT::i1,288isPPC64 ? MVT::i64 : MVT::i32);289} else {290setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom);291setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom);292setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);293setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);294}295296// PowerPC does not support direct load/store of condition registers.297setOperationAction(ISD::LOAD, MVT::i1, Custom);298setOperationAction(ISD::STORE, MVT::i1, Custom);299300// FIXME: Remove this once the ANDI glue bug is fixed:301if (ANDIGlueBug)302setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);303304for (MVT VT : MVT::integer_valuetypes()) {305setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);306setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);307setTruncStoreAction(VT, MVT::i1, Expand);308}309310addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);311}312313// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on314// PPC (the libcall is not available).315setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);316setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);317setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::ppcf128, Custom);318setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::ppcf128, Custom);319320// We do not currently implement these libm ops for PowerPC.321setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);322setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);323setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);324setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);325setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);326setOperationAction(ISD::FREM, MVT::ppcf128, Expand);327328// PowerPC has no SREM/UREM instructions unless we are on P9329// On P9 we may use a hardware instruction to compute the remainder.330// When the result of both the remainder and the division is required it is331// more efficient to compute the remainder from the result of the division332// rather than use the remainder instruction. 
The instructions are legalized333// directly because the DivRemPairsPass performs the transformation at the IR334// level.335if (Subtarget.isISA3_0()) {336setOperationAction(ISD::SREM, MVT::i32, Legal);337setOperationAction(ISD::UREM, MVT::i32, Legal);338setOperationAction(ISD::SREM, MVT::i64, Legal);339setOperationAction(ISD::UREM, MVT::i64, Legal);340} else {341setOperationAction(ISD::SREM, MVT::i32, Expand);342setOperationAction(ISD::UREM, MVT::i32, Expand);343setOperationAction(ISD::SREM, MVT::i64, Expand);344setOperationAction(ISD::UREM, MVT::i64, Expand);345}346347// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.348setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);349setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);350setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);351setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);352setOperationAction(ISD::UDIVREM, MVT::i32, Expand);353setOperationAction(ISD::SDIVREM, MVT::i32, Expand);354setOperationAction(ISD::UDIVREM, MVT::i64, Expand);355setOperationAction(ISD::SDIVREM, MVT::i64, Expand);356357// Handle constrained floating-point operations of scalar.358// TODO: Handle SPE specific operation.359setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);360setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);361setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);362setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);363setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);364365setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);366setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);367setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);368setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);369370if (!Subtarget.hasSPE()) {371setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);372setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);373}374375if (Subtarget.hasVSX()) {376setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal);377setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal);378}379380if (Subtarget.hasFSQRT()) {381setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);382setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);383}384385if (Subtarget.hasFPRND()) {386setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);387setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);388setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);389setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);390391setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);392setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);393setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);394setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);395}396397// We don't support sin/cos/sqrt/fmod/pow398setOperationAction(ISD::FSIN , MVT::f64, Expand);399setOperationAction(ISD::FCOS , MVT::f64, Expand);400setOperationAction(ISD::FSINCOS, MVT::f64, Expand);401setOperationAction(ISD::FREM , MVT::f64, Expand);402setOperationAction(ISD::FPOW , MVT::f64, Expand);403setOperationAction(ISD::FSIN , MVT::f32, Expand);404setOperationAction(ISD::FCOS , MVT::f32, Expand);405setOperationAction(ISD::FSINCOS, MVT::f32, Expand);406setOperationAction(ISD::FREM , MVT::f32, Expand);407setOperationAction(ISD::FPOW , MVT::f32, Expand);408409// MASS transformation for LLVM intrinsics with replicating fast-math flag410// to be consistent to PPCGenScalarMASSEntries pass411if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {412setOperationAction(ISD::FSIN , MVT::f64, Custom);413setOperationAction(ISD::FCOS , MVT::f64, 
Custom);414setOperationAction(ISD::FPOW , MVT::f64, Custom);415setOperationAction(ISD::FLOG, MVT::f64, Custom);416setOperationAction(ISD::FLOG10, MVT::f64, Custom);417setOperationAction(ISD::FEXP, MVT::f64, Custom);418setOperationAction(ISD::FSIN , MVT::f32, Custom);419setOperationAction(ISD::FCOS , MVT::f32, Custom);420setOperationAction(ISD::FPOW , MVT::f32, Custom);421setOperationAction(ISD::FLOG, MVT::f32, Custom);422setOperationAction(ISD::FLOG10, MVT::f32, Custom);423setOperationAction(ISD::FEXP, MVT::f32, Custom);424}425426if (Subtarget.hasSPE()) {427setOperationAction(ISD::FMA , MVT::f64, Expand);428setOperationAction(ISD::FMA , MVT::f32, Expand);429} else {430setOperationAction(ISD::FMA , MVT::f64, Legal);431setOperationAction(ISD::FMA , MVT::f32, Legal);432}433434if (Subtarget.hasSPE())435setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);436437setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);438439// If we're enabling GP optimizations, use hardware square root440if (!Subtarget.hasFSQRT() &&441!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&442Subtarget.hasFRE()))443setOperationAction(ISD::FSQRT, MVT::f64, Expand);444445if (!Subtarget.hasFSQRT() &&446!(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&447Subtarget.hasFRES()))448setOperationAction(ISD::FSQRT, MVT::f32, Expand);449450if (Subtarget.hasFCPSGN()) {451setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);452setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);453} else {454setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);455setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);456}457458if (Subtarget.hasFPRND()) {459setOperationAction(ISD::FFLOOR, MVT::f64, Legal);460setOperationAction(ISD::FCEIL, MVT::f64, Legal);461setOperationAction(ISD::FTRUNC, MVT::f64, Legal);462setOperationAction(ISD::FROUND, MVT::f64, Legal);463464setOperationAction(ISD::FFLOOR, MVT::f32, Legal);465setOperationAction(ISD::FCEIL, MVT::f32, Legal);466setOperationAction(ISD::FTRUNC, MVT::f32, Legal);467setOperationAction(ISD::FROUND, MVT::f32, Legal);468}469470// Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP471// instruction xxbrd to speed up scalar BSWAP64.472if (Subtarget.isISA3_1()) {473setOperationAction(ISD::BSWAP, MVT::i32, Legal);474setOperationAction(ISD::BSWAP, MVT::i64, Legal);475} else {476setOperationAction(ISD::BSWAP, MVT::i32, Expand);477setOperationAction(478ISD::BSWAP, MVT::i64,479(Subtarget.hasP9Vector() && Subtarget.isPPC64()) ? 
Custom : Expand);480}481482// CTPOP or CTTZ were introduced in P8/P9 respectively483if (Subtarget.isISA3_0()) {484setOperationAction(ISD::CTTZ , MVT::i32 , Legal);485setOperationAction(ISD::CTTZ , MVT::i64 , Legal);486} else {487setOperationAction(ISD::CTTZ , MVT::i32 , Expand);488setOperationAction(ISD::CTTZ , MVT::i64 , Expand);489}490491if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {492setOperationAction(ISD::CTPOP, MVT::i32 , Legal);493setOperationAction(ISD::CTPOP, MVT::i64 , Legal);494} else {495setOperationAction(ISD::CTPOP, MVT::i32 , Expand);496setOperationAction(ISD::CTPOP, MVT::i64 , Expand);497}498499// PowerPC does not have ROTR500setOperationAction(ISD::ROTR, MVT::i32 , Expand);501setOperationAction(ISD::ROTR, MVT::i64 , Expand);502503if (!Subtarget.useCRBits()) {504// PowerPC does not have Select505setOperationAction(ISD::SELECT, MVT::i32, Expand);506setOperationAction(ISD::SELECT, MVT::i64, Expand);507setOperationAction(ISD::SELECT, MVT::f32, Expand);508setOperationAction(ISD::SELECT, MVT::f64, Expand);509}510511// PowerPC wants to turn select_cc of FP into fsel when possible.512setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);513setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);514515// PowerPC wants to optimize integer setcc a bit516if (!Subtarget.useCRBits())517setOperationAction(ISD::SETCC, MVT::i32, Custom);518519if (Subtarget.hasFPU()) {520setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);521setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal);522setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Legal);523524setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);525setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);526setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Legal);527}528529// PowerPC does not have BRCOND which requires SetCC530if (!Subtarget.useCRBits())531setOperationAction(ISD::BRCOND, MVT::Other, Expand);532533setOperationAction(ISD::BR_JT, MVT::Other, Expand);534535if (Subtarget.hasSPE()) {536// SPE has built-in conversions537setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);538setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);539setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);540setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);541setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);542setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);543544// SPE supports signaling compare of f32/f64.545setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);546setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);547} else {548// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.549setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);550setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);551552// PowerPC does not have [U|S]INT_TO_FP553setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Expand);554setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Expand);555setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);556setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);557}558559if (Subtarget.hasDirectMove() && isPPC64) {560setOperationAction(ISD::BITCAST, MVT::f32, Legal);561setOperationAction(ISD::BITCAST, MVT::i32, Legal);562setOperationAction(ISD::BITCAST, MVT::i64, Legal);563setOperationAction(ISD::BITCAST, MVT::f64, Legal);564if (TM.Options.UnsafeFPMath) {565setOperationAction(ISD::LRINT, MVT::f64, Legal);566setOperationAction(ISD::LRINT, MVT::f32, Legal);567setOperationAction(ISD::LLRINT, MVT::f64, Legal);568setOperationAction(ISD::LLRINT, MVT::f32, 
Legal);569setOperationAction(ISD::LROUND, MVT::f64, Legal);570setOperationAction(ISD::LROUND, MVT::f32, Legal);571setOperationAction(ISD::LLROUND, MVT::f64, Legal);572setOperationAction(ISD::LLROUND, MVT::f32, Legal);573}574} else {575setOperationAction(ISD::BITCAST, MVT::f32, Expand);576setOperationAction(ISD::BITCAST, MVT::i32, Expand);577setOperationAction(ISD::BITCAST, MVT::i64, Expand);578setOperationAction(ISD::BITCAST, MVT::f64, Expand);579}580581// We cannot sextinreg(i1). Expand to shifts.582setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);583584// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support585// SjLj exception handling but a light-weight setjmp/longjmp replacement to586// support continuation, user-level threading, and etc.. As a result, no587// other SjLj exception interfaces are implemented and please don't build588// your own exception handling based on them.589// LLVM/Clang supports zero-cost DWARF exception handling.590setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);591setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);592593// We want to legalize GlobalAddress and ConstantPool nodes into the594// appropriate instructions to materialize the address.595setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);596setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);597setOperationAction(ISD::BlockAddress, MVT::i32, Custom);598setOperationAction(ISD::ConstantPool, MVT::i32, Custom);599setOperationAction(ISD::JumpTable, MVT::i32, Custom);600setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);601setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);602setOperationAction(ISD::BlockAddress, MVT::i64, Custom);603setOperationAction(ISD::ConstantPool, MVT::i64, Custom);604setOperationAction(ISD::JumpTable, MVT::i64, Custom);605606// TRAP is legal.607setOperationAction(ISD::TRAP, MVT::Other, Legal);608609// TRAMPOLINE is custom lowered.610setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);611setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);612613// VASTART needs to be custom lowered to use the VarArgsFrameIndex614setOperationAction(ISD::VASTART , MVT::Other, Custom);615616if (Subtarget.is64BitELFABI()) {617// VAARG always uses double-word chunks, so promote anything smaller.618setOperationAction(ISD::VAARG, MVT::i1, Promote);619AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);620setOperationAction(ISD::VAARG, MVT::i8, Promote);621AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);622setOperationAction(ISD::VAARG, MVT::i16, Promote);623AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);624setOperationAction(ISD::VAARG, MVT::i32, Promote);625AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);626setOperationAction(ISD::VAARG, MVT::Other, Expand);627} else if (Subtarget.is32BitELFABI()) {628// VAARG is custom lowered with the 32-bit SVR4 ABI.629setOperationAction(ISD::VAARG, MVT::Other, Custom);630setOperationAction(ISD::VAARG, MVT::i64, Custom);631} else632setOperationAction(ISD::VAARG, MVT::Other, Expand);633634// VACOPY is custom lowered with the 32-bit SVR4 ABI.635if (Subtarget.is32BitELFABI())636setOperationAction(ISD::VACOPY , MVT::Other, Custom);637else638setOperationAction(ISD::VACOPY , MVT::Other, Expand);639640// Use the default implementation.641setOperationAction(ISD::VAEND , MVT::Other, Expand);642setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);643setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);644setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , 
Custom);645setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);646setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);647setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);648setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);649setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);650651// We want to custom lower some of our intrinsics.652setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);653setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);654setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);655setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);656setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);657658// To handle counter-based loop conditions.659setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);660661setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);662setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);663setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);664setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);665666// Comparisons that require checking two conditions.667if (Subtarget.hasSPE()) {668setCondCodeAction(ISD::SETO, MVT::f32, Expand);669setCondCodeAction(ISD::SETO, MVT::f64, Expand);670setCondCodeAction(ISD::SETUO, MVT::f32, Expand);671setCondCodeAction(ISD::SETUO, MVT::f64, Expand);672}673setCondCodeAction(ISD::SETULT, MVT::f32, Expand);674setCondCodeAction(ISD::SETULT, MVT::f64, Expand);675setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);676setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);677setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);678setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);679setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);680setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);681setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);682setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);683setCondCodeAction(ISD::SETONE, MVT::f32, Expand);684setCondCodeAction(ISD::SETONE, MVT::f64, Expand);685686setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);687setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);688689if (Subtarget.has64BitSupport()) {690// They also have instructions for converting between i64 and fp.691setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);692setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Expand);693setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);694setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);695setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);696setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);697setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);698setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);699// This is just the low 32 bits of a (signed) fp->i64 conversion.700// We cannot do this with Promote because i64 is not a legal type.701setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);702setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);703704if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) {705setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);706setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);707}708} else {709// PowerPC does not have FP_TO_UINT on 32-bit implementations.710if (Subtarget.hasSPE()) {711setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);712setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);713} else {714setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Expand);715setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);716}717}718719// With the instructions 
enabled under FPCVT, we can do everything.720if (Subtarget.hasFPCVT()) {721if (Subtarget.has64BitSupport()) {722setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);723setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);724setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);725setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);726setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);727setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);728setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);729setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);730}731732setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);733setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);734setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);735setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);736setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);737setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);738setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);739setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);740}741742if (Subtarget.use64BitRegs()) {743// 64-bit PowerPC implementations can support i64 types directly744addRegisterClass(MVT::i64, &PPC::G8RCRegClass);745// BUILD_PAIR can't be handled natively, and should be expanded to shl/or746setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);747// 64-bit PowerPC wants to expand i128 shifts itself.748setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);749setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);750setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);751} else {752// 32-bit PowerPC wants to expand i64 shifts itself.753setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);754setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);755setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);756}757758// PowerPC has better expansions for funnel shifts than the generic759// TargetLowering::expandFunnelShift.760if (Subtarget.has64BitSupport()) {761setOperationAction(ISD::FSHL, MVT::i64, Custom);762setOperationAction(ISD::FSHR, MVT::i64, Custom);763}764setOperationAction(ISD::FSHL, MVT::i32, Custom);765setOperationAction(ISD::FSHR, MVT::i32, Custom);766767if (Subtarget.hasVSX()) {768setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);769setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);770setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);771setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);772}773774if (Subtarget.hasAltivec()) {775for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {776setOperationAction(ISD::SADDSAT, VT, Legal);777setOperationAction(ISD::SSUBSAT, VT, Legal);778setOperationAction(ISD::UADDSAT, VT, Legal);779setOperationAction(ISD::USUBSAT, VT, Legal);780}781// First set operation action for all vector types to expand. Then we782// will selectively turn on ones that can be effectively codegen'd.783for (MVT VT : MVT::fixedlen_vector_valuetypes()) {784// add/sub are legal for all supported vector VT's.785setOperationAction(ISD::ADD, VT, Legal);786setOperationAction(ISD::SUB, VT, Legal);787788// For v2i64, these are only valid with P8Vector. 
This is corrected after789// the loop.790if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {791setOperationAction(ISD::SMAX, VT, Legal);792setOperationAction(ISD::SMIN, VT, Legal);793setOperationAction(ISD::UMAX, VT, Legal);794setOperationAction(ISD::UMIN, VT, Legal);795}796else {797setOperationAction(ISD::SMAX, VT, Expand);798setOperationAction(ISD::SMIN, VT, Expand);799setOperationAction(ISD::UMAX, VT, Expand);800setOperationAction(ISD::UMIN, VT, Expand);801}802803if (Subtarget.hasVSX()) {804setOperationAction(ISD::FMAXNUM, VT, Legal);805setOperationAction(ISD::FMINNUM, VT, Legal);806}807808// Vector instructions introduced in P8809if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {810setOperationAction(ISD::CTPOP, VT, Legal);811setOperationAction(ISD::CTLZ, VT, Legal);812}813else {814setOperationAction(ISD::CTPOP, VT, Expand);815setOperationAction(ISD::CTLZ, VT, Expand);816}817818// Vector instructions introduced in P9819if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))820setOperationAction(ISD::CTTZ, VT, Legal);821else822setOperationAction(ISD::CTTZ, VT, Expand);823824// We promote all shuffles to v16i8.825setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);826AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);827828// We promote all non-typed operations to v4i32.829setOperationAction(ISD::AND , VT, Promote);830AddPromotedToType (ISD::AND , VT, MVT::v4i32);831setOperationAction(ISD::OR , VT, Promote);832AddPromotedToType (ISD::OR , VT, MVT::v4i32);833setOperationAction(ISD::XOR , VT, Promote);834AddPromotedToType (ISD::XOR , VT, MVT::v4i32);835setOperationAction(ISD::LOAD , VT, Promote);836AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);837setOperationAction(ISD::SELECT, VT, Promote);838AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);839setOperationAction(ISD::VSELECT, VT, Legal);840setOperationAction(ISD::SELECT_CC, VT, Promote);841AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);842setOperationAction(ISD::STORE, VT, Promote);843AddPromotedToType (ISD::STORE, VT, MVT::v4i32);844845// No other operations are legal.846setOperationAction(ISD::MUL , VT, Expand);847setOperationAction(ISD::SDIV, VT, Expand);848setOperationAction(ISD::SREM, VT, Expand);849setOperationAction(ISD::UDIV, VT, Expand);850setOperationAction(ISD::UREM, VT, Expand);851setOperationAction(ISD::FDIV, VT, Expand);852setOperationAction(ISD::FREM, VT, Expand);853setOperationAction(ISD::FNEG, VT, Expand);854setOperationAction(ISD::FSQRT, VT, Expand);855setOperationAction(ISD::FLOG, VT, Expand);856setOperationAction(ISD::FLOG10, VT, Expand);857setOperationAction(ISD::FLOG2, VT, Expand);858setOperationAction(ISD::FEXP, VT, Expand);859setOperationAction(ISD::FEXP2, VT, Expand);860setOperationAction(ISD::FSIN, VT, Expand);861setOperationAction(ISD::FCOS, VT, Expand);862setOperationAction(ISD::FABS, VT, Expand);863setOperationAction(ISD::FFLOOR, VT, Expand);864setOperationAction(ISD::FCEIL, VT, Expand);865setOperationAction(ISD::FTRUNC, VT, Expand);866setOperationAction(ISD::FRINT, VT, Expand);867setOperationAction(ISD::FLDEXP, VT, Expand);868setOperationAction(ISD::FNEARBYINT, VT, Expand);869setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);870setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);871setOperationAction(ISD::BUILD_VECTOR, VT, Expand);872setOperationAction(ISD::MULHU, VT, Expand);873setOperationAction(ISD::MULHS, VT, Expand);874setOperationAction(ISD::UMUL_LOHI, VT, Expand);875setOperationAction(ISD::SMUL_LOHI, VT, Expand);876setOperationAction(ISD::UDIVREM, 
VT, Expand);877setOperationAction(ISD::SDIVREM, VT, Expand);878setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);879setOperationAction(ISD::FPOW, VT, Expand);880setOperationAction(ISD::BSWAP, VT, Expand);881setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);882setOperationAction(ISD::ROTL, VT, Expand);883setOperationAction(ISD::ROTR, VT, Expand);884885for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {886setTruncStoreAction(VT, InnerVT, Expand);887setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);888setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);889setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);890}891}892setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);893if (!Subtarget.hasP8Vector()) {894setOperationAction(ISD::SMAX, MVT::v2i64, Expand);895setOperationAction(ISD::SMIN, MVT::v2i64, Expand);896setOperationAction(ISD::UMAX, MVT::v2i64, Expand);897setOperationAction(ISD::UMIN, MVT::v2i64, Expand);898}899900// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle901// with merges, splats, etc.902setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);903904// Vector truncates to sub-word integer that fit in an Altivec/VSX register905// are cheap, so handle them before they get expanded to scalar.906setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);907setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);908setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);909setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);910setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);911912setOperationAction(ISD::AND , MVT::v4i32, Legal);913setOperationAction(ISD::OR , MVT::v4i32, Legal);914setOperationAction(ISD::XOR , MVT::v4i32, Legal);915setOperationAction(ISD::LOAD , MVT::v4i32, Legal);916setOperationAction(ISD::SELECT, MVT::v4i32,917Subtarget.useCRBits() ? 
Legal : Expand);918setOperationAction(ISD::STORE , MVT::v4i32, Legal);919setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);920setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);921setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);922setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);923setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);924setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);925setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);926setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);927setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);928setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);929setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);930setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);931932// Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.933setOperationAction(ISD::ROTL, MVT::v1i128, Custom);934// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).935if (Subtarget.hasAltivec())936for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})937setOperationAction(ISD::ROTL, VT, Legal);938// With hasP8Altivec set, we can lower ISD::ROTL to vrld.939if (Subtarget.hasP8Altivec())940setOperationAction(ISD::ROTL, MVT::v2i64, Legal);941942addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);943addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);944addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);945addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);946947setOperationAction(ISD::MUL, MVT::v4f32, Legal);948setOperationAction(ISD::FMA, MVT::v4f32, Legal);949950if (Subtarget.hasVSX()) {951setOperationAction(ISD::FDIV, MVT::v4f32, Legal);952setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);953setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);954}955956if (Subtarget.hasP8Altivec())957setOperationAction(ISD::MUL, MVT::v4i32, Legal);958else959setOperationAction(ISD::MUL, MVT::v4i32, Custom);960961if (Subtarget.isISA3_1()) {962setOperationAction(ISD::MUL, MVT::v2i64, Legal);963setOperationAction(ISD::MULHS, MVT::v2i64, Legal);964setOperationAction(ISD::MULHU, MVT::v2i64, Legal);965setOperationAction(ISD::MULHS, MVT::v4i32, Legal);966setOperationAction(ISD::MULHU, MVT::v4i32, Legal);967setOperationAction(ISD::UDIV, MVT::v2i64, Legal);968setOperationAction(ISD::SDIV, MVT::v2i64, Legal);969setOperationAction(ISD::UDIV, MVT::v4i32, Legal);970setOperationAction(ISD::SDIV, MVT::v4i32, Legal);971setOperationAction(ISD::UREM, MVT::v2i64, Legal);972setOperationAction(ISD::SREM, MVT::v2i64, Legal);973setOperationAction(ISD::UREM, MVT::v4i32, Legal);974setOperationAction(ISD::SREM, MVT::v4i32, Legal);975setOperationAction(ISD::UREM, MVT::v1i128, Legal);976setOperationAction(ISD::SREM, MVT::v1i128, Legal);977setOperationAction(ISD::UDIV, MVT::v1i128, Legal);978setOperationAction(ISD::SDIV, MVT::v1i128, Legal);979setOperationAction(ISD::ROTL, MVT::v1i128, Legal);980}981982setOperationAction(ISD::MUL, MVT::v8i16, Legal);983setOperationAction(ISD::MUL, MVT::v16i8, Custom);984985setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);986setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);987988setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);989setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);990setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);991setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);992993// Altivec does not contain unordered floating-point compare instructions994setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);995setCondCodeAction(ISD::SETUEQ, MVT::v4f32, 
Expand);996setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);997setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);998999if (Subtarget.hasVSX()) {1000setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);1001setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);1002if (Subtarget.hasP8Vector()) {1003setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);1004setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);1005}1006if (Subtarget.hasDirectMove() && isPPC64) {1007setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);1008setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);1009setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);1010setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);1011setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);1012setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);1013setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);1014setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);1015}1016setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);10171018// The nearbyint variants are not allowed to raise the inexact exception1019// so we can only code-gen them with unsafe math.1020if (TM.Options.UnsafeFPMath) {1021setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);1022setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);1023}10241025setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);1026setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);1027setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);1028setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);1029setOperationAction(ISD::FRINT, MVT::v2f64, Legal);1030setOperationAction(ISD::FROUND, MVT::v2f64, Legal);1031setOperationAction(ISD::FROUND, MVT::f64, Legal);1032setOperationAction(ISD::FRINT, MVT::f64, Legal);10331034setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);1035setOperationAction(ISD::FRINT, MVT::v4f32, Legal);1036setOperationAction(ISD::FROUND, MVT::v4f32, Legal);1037setOperationAction(ISD::FROUND, MVT::f32, Legal);1038setOperationAction(ISD::FRINT, MVT::f32, Legal);10391040setOperationAction(ISD::MUL, MVT::v2f64, Legal);1041setOperationAction(ISD::FMA, MVT::v2f64, Legal);10421043setOperationAction(ISD::FDIV, MVT::v2f64, Legal);1044setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);10451046// Share the Altivec comparison restrictions.1047setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);1048setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);1049setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);1050setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);10511052setOperationAction(ISD::LOAD, MVT::v2f64, Legal);1053setOperationAction(ISD::STORE, MVT::v2f64, Legal);10541055setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);10561057if (Subtarget.hasP8Vector())1058addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);10591060addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);10611062addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);1063addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);1064addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);10651066if (Subtarget.hasP8Altivec()) {1067setOperationAction(ISD::SHL, MVT::v2i64, Legal);1068setOperationAction(ISD::SRA, MVT::v2i64, Legal);1069setOperationAction(ISD::SRL, MVT::v2i64, Legal);10701071// 128 bit shifts can be accomplished via 3 instructions for SHL and1072// SRL, but not for SRA because of the instructions available:1073// VS{RL} and VS{RL}O. 
However due to direct move costs, it's not worth1074// doing1075setOperationAction(ISD::SHL, MVT::v1i128, Expand);1076setOperationAction(ISD::SRL, MVT::v1i128, Expand);1077setOperationAction(ISD::SRA, MVT::v1i128, Expand);10781079setOperationAction(ISD::SETCC, MVT::v2i64, Legal);1080}1081else {1082setOperationAction(ISD::SHL, MVT::v2i64, Expand);1083setOperationAction(ISD::SRA, MVT::v2i64, Expand);1084setOperationAction(ISD::SRL, MVT::v2i64, Expand);10851086setOperationAction(ISD::SETCC, MVT::v2i64, Custom);10871088// VSX v2i64 only supports non-arithmetic operations.1089setOperationAction(ISD::ADD, MVT::v2i64, Expand);1090setOperationAction(ISD::SUB, MVT::v2i64, Expand);1091}10921093if (Subtarget.isISA3_1())1094setOperationAction(ISD::SETCC, MVT::v1i128, Legal);1095else1096setOperationAction(ISD::SETCC, MVT::v1i128, Expand);10971098setOperationAction(ISD::LOAD, MVT::v2i64, Promote);1099AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);1100setOperationAction(ISD::STORE, MVT::v2i64, Promote);1101AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);11021103setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);11041105setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);1106setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);1107setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);1108setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);1109setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);1110setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);1111setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);1112setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);11131114// Custom handling for partial vectors of integers converted to1115// floating point. We already have optimal handling for v2i32 through1116// the DAG combine, so those aren't necessary.1117setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i8, Custom);1118setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i8, Custom);1119setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i16, Custom);1120setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i16, Custom);1121setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i8, Custom);1122setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i8, Custom);1123setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i16, Custom);1124setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i16, Custom);1125setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);1126setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);1127setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);1128setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);1129setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);1130setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);1131setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);1132setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);11331134setOperationAction(ISD::FNEG, MVT::v4f32, Legal);1135setOperationAction(ISD::FNEG, MVT::v2f64, Legal);1136setOperationAction(ISD::FABS, MVT::v4f32, Legal);1137setOperationAction(ISD::FABS, MVT::v2f64, Legal);1138setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);1139setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);11401141setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);1142setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);11431144// Handle constrained floating-point operations of vector.1145// The predictor is `hasVSX` because altivec instruction has1146// no exception but VSX vector instruction has.1147setOperationAction(ISD::STRICT_FADD, MVT::v4f32, 
Legal);1148setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);1149setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);1150setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);1151setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);1152setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);1153setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);1154setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);1155setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);1156setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);1157setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);1158setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);1159setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);11601161setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);1162setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);1163setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);1164setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);1165setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);1166setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);1167setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);1168setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);1169setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);1170setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);1171setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);1172setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);1173setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);11741175addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);1176addRegisterClass(MVT::f128, &PPC::VRRCRegClass);11771178for (MVT FPT : MVT::fp_valuetypes())1179setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);11801181// Expand the SELECT to SELECT_CC1182setOperationAction(ISD::SELECT, MVT::f128, Expand);11831184setTruncStoreAction(MVT::f128, MVT::f64, Expand);1185setTruncStoreAction(MVT::f128, MVT::f32, Expand);11861187// No implementation for these ops for PowerPC.1188setOperationAction(ISD::FSINCOS, MVT::f128, Expand);1189setOperationAction(ISD::FSIN, MVT::f128, Expand);1190setOperationAction(ISD::FCOS, MVT::f128, Expand);1191setOperationAction(ISD::FPOW, MVT::f128, Expand);1192setOperationAction(ISD::FPOWI, MVT::f128, Expand);1193setOperationAction(ISD::FREM, MVT::f128, Expand);1194}11951196if (Subtarget.hasP8Altivec()) {1197addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);1198addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);1199}12001201if (Subtarget.hasP9Vector()) {1202setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);1203setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);12041205// Test data class instructions store results in CR bits.1206if (Subtarget.useCRBits()) {1207setOperationAction(ISD::IS_FPCLASS, MVT::f32, Custom);1208setOperationAction(ISD::IS_FPCLASS, MVT::f64, Custom);1209setOperationAction(ISD::IS_FPCLASS, MVT::f128, Custom);1210}12111212// 128 bit shifts can be accomplished via 3 instructions for SHL and1213// SRL, but not for SRA because of the instructions available:1214// VS{RL} and VS{RL}O.1215setOperationAction(ISD::SHL, MVT::v1i128, Legal);1216setOperationAction(ISD::SRL, MVT::v1i128, Legal);1217setOperationAction(ISD::SRA, MVT::v1i128, Expand);12181219setOperationAction(ISD::FADD, MVT::f128, Legal);1220setOperationAction(ISD::FSUB, MVT::f128, Legal);1221setOperationAction(ISD::FDIV, MVT::f128, Legal);1222setOperationAction(ISD::FMUL, MVT::f128, Legal);1223setOperationAction(ISD::FP_EXTEND, MVT::f128, 
Legal);12241225setOperationAction(ISD::FMA, MVT::f128, Legal);1226setCondCodeAction(ISD::SETULT, MVT::f128, Expand);1227setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);1228setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);1229setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);1230setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);1231setCondCodeAction(ISD::SETONE, MVT::f128, Expand);12321233setOperationAction(ISD::FTRUNC, MVT::f128, Legal);1234setOperationAction(ISD::FRINT, MVT::f128, Legal);1235setOperationAction(ISD::FFLOOR, MVT::f128, Legal);1236setOperationAction(ISD::FCEIL, MVT::f128, Legal);1237setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);1238setOperationAction(ISD::FROUND, MVT::f128, Legal);12391240setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);1241setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);1242setOperationAction(ISD::BITCAST, MVT::i128, Custom);12431244// Handle constrained floating-point operations of fp1281245setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);1246setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);1247setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);1248setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);1249setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);1250setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);1251setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);1252setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);1253setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);1254setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);1255setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);1256setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);1257setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);1258setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);1259setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);1260setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);1261setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);1262setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);1263setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);1264setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);1265} else if (Subtarget.hasVSX()) {1266setOperationAction(ISD::LOAD, MVT::f128, Promote);1267setOperationAction(ISD::STORE, MVT::f128, Promote);12681269AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32);1270AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32);12711272// Set FADD/FSUB as libcall to avoid the legalizer to expand the1273// fp_to_uint and int_to_fp.1274setOperationAction(ISD::FADD, MVT::f128, LibCall);1275setOperationAction(ISD::FSUB, MVT::f128, LibCall);12761277setOperationAction(ISD::FMUL, MVT::f128, Expand);1278setOperationAction(ISD::FDIV, MVT::f128, Expand);1279setOperationAction(ISD::FNEG, MVT::f128, Expand);1280setOperationAction(ISD::FABS, MVT::f128, Expand);1281setOperationAction(ISD::FSQRT, MVT::f128, Expand);1282setOperationAction(ISD::FMA, MVT::f128, Expand);1283setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);12841285// Expand the fp_extend if the target type is fp128.1286setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);1287setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand);12881289// Expand the fp_round if the source type is fp128.1290for (MVT VT : {MVT::f32, MVT::f64}) {1291setOperationAction(ISD::FP_ROUND, VT, Custom);1292setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);1293}12941295setOperationAction(ISD::SETCC, MVT::f128, Custom);1296setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);1297setOperationAction(ISD::STRICT_FSETCCS, 
MVT::f128, Custom);1298setOperationAction(ISD::BR_CC, MVT::f128, Expand);12991300// Lower following f128 select_cc pattern:1301// select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE1302setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);13031304// We need to handle f128 SELECT_CC with integer result type.1305setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);1306setOperationAction(ISD::SELECT_CC, MVT::i64, isPPC64 ? Custom : Expand);1307}13081309if (Subtarget.hasP9Altivec()) {1310if (Subtarget.isISA3_1()) {1311setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);1312setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Legal);1313setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Legal);1314setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);1315} else {1316setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);1317setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);1318}1319setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);1320setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);1321setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);1322setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);1323setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);1324setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);1325setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);13261327setOperationAction(ISD::ABDU, MVT::v16i8, Legal);1328setOperationAction(ISD::ABDU, MVT::v8i16, Legal);1329setOperationAction(ISD::ABDU, MVT::v4i32, Legal);1330setOperationAction(ISD::ABDS, MVT::v4i32, Legal);1331}13321333if (Subtarget.hasP10Vector()) {1334setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);1335}1336}13371338if (Subtarget.pairedVectorMemops()) {1339addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);1340setOperationAction(ISD::LOAD, MVT::v256i1, Custom);1341setOperationAction(ISD::STORE, MVT::v256i1, Custom);1342}1343if (Subtarget.hasMMA()) {1344if (Subtarget.isISAFuture())1345addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass);1346else1347addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);1348setOperationAction(ISD::LOAD, MVT::v512i1, Custom);1349setOperationAction(ISD::STORE, MVT::v512i1, Custom);1350setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);1351}13521353if (Subtarget.has64BitSupport())1354setOperationAction(ISD::PREFETCH, MVT::Other, Legal);13551356if (Subtarget.isISA3_1())1357setOperationAction(ISD::SRA, MVT::v1i128, Legal);13581359setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);13601361if (!isPPC64) {1362setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);1363setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);1364}13651366if (shouldInlineQuadwordAtomics()) {1367setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);1368setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);1369setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);1370}13711372setBooleanContents(ZeroOrOneBooleanContent);13731374if (Subtarget.hasAltivec()) {1375// Altivec instructions set fields to all zeros or all ones.1376setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);1377}13781379if (shouldInlineQuadwordAtomics())1380setMaxAtomicSizeInBitsSupported(128);1381else if (isPPC64)1382setMaxAtomicSizeInBitsSupported(64);1383else1384setMaxAtomicSizeInBitsSupported(32);13851386setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1);13871388// We have target-specific dag combine patterns for the following nodes:1389setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL,1390ISD::MUL, ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});1391if (Subtarget.hasFPCVT())1392setTargetDAGCombine(ISD::UINT_TO_FP);1393setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});1394if (Subtarget.useCRBits())1395setTargetDAGCombine(ISD::BRCOND);1396setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,1397ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});13981399setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});14001401setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});14021403if (Subtarget.useCRBits()) {1404setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});1405}14061407setLibcallName(RTLIB::LOG_F128, "logf128");1408setLibcallName(RTLIB::LOG2_F128, "log2f128");1409setLibcallName(RTLIB::LOG10_F128, "log10f128");1410setLibcallName(RTLIB::EXP_F128, "expf128");1411setLibcallName(RTLIB::EXP2_F128, "exp2f128");1412setLibcallName(RTLIB::SIN_F128, "sinf128");1413setLibcallName(RTLIB::COS_F128, "cosf128");1414setLibcallName(RTLIB::SINCOS_F128, "sincosf128");1415setLibcallName(RTLIB::POW_F128, "powf128");1416setLibcallName(RTLIB::FMIN_F128, "fminf128");1417setLibcallName(RTLIB::FMAX_F128, "fmaxf128");1418setLibcallName(RTLIB::REM_F128, "fmodf128");1419setLibcallName(RTLIB::SQRT_F128, "sqrtf128");1420setLibcallName(RTLIB::CEIL_F128, "ceilf128");1421setLibcallName(RTLIB::FLOOR_F128, "floorf128");1422setLibcallName(RTLIB::TRUNC_F128, "truncf128");1423setLibcallName(RTLIB::ROUND_F128, "roundf128");1424setLibcallName(RTLIB::LROUND_F128, "lroundf128");1425setLibcallName(RTLIB::LLROUND_F128, "llroundf128");1426setLibcallName(RTLIB::RINT_F128, "rintf128");1427setLibcallName(RTLIB::LRINT_F128, "lrintf128");1428setLibcallName(RTLIB::LLRINT_F128, "llrintf128");1429setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");1430setLibcallName(RTLIB::FMA_F128, "fmaf128");1431setLibcallName(RTLIB::FREXP_F128, "frexpf128");14321433if (Subtarget.isAIXABI()) {1434setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");1435setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");1436setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");1437setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");1438}14391440// With 32 condition bits, we don't need to sink (and duplicate) compares1441// aggressively in CodeGenPrep.1442if (Subtarget.useCRBits()) {1443setHasMultipleConditionRegisters();1444setJumpIsExpensive();1445}14461447// TODO: The default entry number is set to 64. This stops most jump table1448// generation on PPC. 
But it is good for current PPC hardware because the indirect
  // branch instruction mtctr to the jump table may lead to bad branch
  // prediction. Re-evaluate this value on future hardware that can do better
  // with mtctr.
  setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);

  setMinFunctionAlignment(Align(4));

  switch (Subtarget.getCPUDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_A2:
  case PPC::DIR_E500:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
  case PPC::DIR_PWR10:
  case PPC::DIR_PWR11:
  case PPC::DIR_PWR_FUTURE:
    setPrefLoopAlignment(Align(16));
    setPrefFunctionAlignment(Align(16));
    break;
  }

  if (Subtarget.enableMachineScheduler())
    setSchedulingPreference(Sched::Source);
  else
    setSchedulingPreference(Sched::Hybrid);

  computeRegisterProperties(STI.getRegisterInfo());

  // The Freescale cores do better with aggressive inlining of memcpy and
  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
  if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
      Subtarget.getCPUDirective() == PPC::DIR_E5500) {
    MaxStoresPerMemset = 32;
    MaxStoresPerMemsetOptSize = 16;
    MaxStoresPerMemcpy = 32;
    MaxStoresPerMemcpyOptSize = 8;
    MaxStoresPerMemmove = 32;
    MaxStoresPerMemmoveOptSize = 8;
  } else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
    // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
    // over one hundred cycles.
    MaxStoresPerMemset = 128;
    MaxStoresPerMemcpy = 128;
    MaxStoresPerMemmove = 128;
    MaxLoadsPerMemcmp = 128;
  } else {
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }

  IsStrictFPEnabled = true;

  // Let the subtarget (CPU) decide if a predictable select is more expensive
  // than the corresponding branch. This information is used in CGP to decide
  // when to convert selects into branches.
  PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();

  GatherAllAliasesMaxDepth = PPCGatherAllAliasesMaxDepth;
}

// *********************************** NOTE ************************************
// For selecting load and store instructions, the addressing modes are defined
// as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
// patterns to match the load and store instructions.
//
// The TD definitions for the addressing modes correspond to their respective
// Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
// on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
// address mode flags of a particular node. Afterwards, the computed address
// flags are passed into getAddrModeForFlags() in order to retrieve the optimal
// addressing mode.
SelectOptimalAddrMode() then sets the Base and Displacement1529// accordingly, based on the preferred addressing mode.1530//1531// Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.1532// MemOpFlags contains all the possible flags that can be used to compute the1533// optimal addressing mode for load and store instructions.1534// AddrMode contains all the possible load and store addressing modes available1535// on Power (such as DForm, DSForm, DQForm, XForm, etc.)1536//1537// When adding new load and store instructions, it is possible that new address1538// flags may need to be added into MemOpFlags, and a new addressing mode will1539// need to be added to AddrMode. An entry of the new addressing mode (consisting1540// of the minimal and main distinguishing address flags for the new load/store1541// instructions) will need to be added into initializeAddrModeMap() below.1542// Finally, when adding new addressing modes, the getAddrModeForFlags() will1543// need to be updated to account for selecting the optimal addressing mode.1544// *****************************************************************************1545/// Initialize the map that relates the different addressing modes of the load1546/// and store instructions to a set of flags. This ensures the load/store1547/// instruction is correctly matched during instruction selection.1548void PPCTargetLowering::initializeAddrModeMap() {1549AddrModesMap[PPC::AM_DForm] = {1550// LWZ, STW1551PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,1552PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,1553PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,1554PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,1555// LBZ, LHZ, STB, STH1556PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,1557PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,1558PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,1559PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,1560// LHA1561PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,1562PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,1563PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,1564PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,1565// LFS, LFD, STFS, STFD1566PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,1567PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,1568PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,1569PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,1570};1571AddrModesMap[PPC::AM_DSForm] = {1572// LWA1573PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,1574PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,1575PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,1576// LD, STD1577PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,1578PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,1579PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,1580// DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf641581PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,1582PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,1583PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,1584};1585AddrModesMap[PPC::AM_DQForm] = {1586// LXV, STXV1587PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,1588PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,1589PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | 
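      // Note: the DQ-form lxv/stxv displacement must be a multiple of 16,
      // which is why the register-plus-immediate entries above require
      // MOF_RPlusSImm16Mult16.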
PPC::MOF_SubtargetP9,1590};1591AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |1592PPC::MOF_SubtargetP10};1593// TODO: Add mapping for quadword load/store.1594}15951596/// getMaxByValAlign - Helper for getByValTypeAlignment to determine1597/// the desired ByVal argument alignment.1598static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {1599if (MaxAlign == MaxMaxAlign)1600return;1601if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {1602if (MaxMaxAlign >= 32 &&1603VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)1604MaxAlign = Align(32);1605else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&1606MaxAlign < 16)1607MaxAlign = Align(16);1608} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {1609Align EltAlign;1610getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);1611if (EltAlign > MaxAlign)1612MaxAlign = EltAlign;1613} else if (StructType *STy = dyn_cast<StructType>(Ty)) {1614for (auto *EltTy : STy->elements()) {1615Align EltAlign;1616getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);1617if (EltAlign > MaxAlign)1618MaxAlign = EltAlign;1619if (MaxAlign == MaxMaxAlign)1620break;1621}1622}1623}16241625/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate1626/// function arguments in the caller parameter area.1627uint64_t PPCTargetLowering::getByValTypeAlignment(Type *Ty,1628const DataLayout &DL) const {1629// 16byte and wider vectors are passed on 16byte boundary.1630// The rest is 8 on PPC64 and 4 on PPC32 boundary.1631Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);1632if (Subtarget.hasAltivec())1633getMaxByValAlign(Ty, Alignment, Align(16));1634return Alignment.value();1635}16361637bool PPCTargetLowering::useSoftFloat() const {1638return Subtarget.useSoftFloat();1639}16401641bool PPCTargetLowering::hasSPE() const {1642return Subtarget.hasSPE();1643}16441645bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {1646return VT.isScalarInteger();1647}16481649bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(1650Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {1651if (!Subtarget.isPPC64() || !Subtarget.hasVSX())1652return false;16531654if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {1655if (VTy->getScalarType()->isIntegerTy()) {1656// ElemSizeInBits 8/16 can fit in immediate field, not needed here.1657if (ElemSizeInBits == 32) {1658Index = Subtarget.isLittleEndian() ? 2 : 1;1659return true;1660}1661if (ElemSizeInBits == 64) {1662Index = Subtarget.isLittleEndian() ? 
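        // For a constant splat every lane holds the same value, so the index
        // only selects which lane the extract reads: lane 1 on little-endian
        // targets and lane 0 on big-endian (the 32-bit case above uses 2 / 1).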
1 : 0;1663return true;1664}1665}1666}1667return false;1668}16691670const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {1671switch ((PPCISD::NodeType)Opcode) {1672case PPCISD::FIRST_NUMBER: break;1673case PPCISD::FSEL: return "PPCISD::FSEL";1674case PPCISD::XSMAXC: return "PPCISD::XSMAXC";1675case PPCISD::XSMINC: return "PPCISD::XSMINC";1676case PPCISD::FCFID: return "PPCISD::FCFID";1677case PPCISD::FCFIDU: return "PPCISD::FCFIDU";1678case PPCISD::FCFIDS: return "PPCISD::FCFIDS";1679case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";1680case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";1681case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";1682case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";1683case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";1684case PPCISD::FRE: return "PPCISD::FRE";1685case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";1686case PPCISD::FTSQRT:1687return "PPCISD::FTSQRT";1688case PPCISD::FSQRT:1689return "PPCISD::FSQRT";1690case PPCISD::STFIWX: return "PPCISD::STFIWX";1691case PPCISD::VPERM: return "PPCISD::VPERM";1692case PPCISD::XXSPLT: return "PPCISD::XXSPLT";1693case PPCISD::XXSPLTI_SP_TO_DP:1694return "PPCISD::XXSPLTI_SP_TO_DP";1695case PPCISD::XXSPLTI32DX:1696return "PPCISD::XXSPLTI32DX";1697case PPCISD::VECINSERT: return "PPCISD::VECINSERT";1698case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";1699case PPCISD::XXPERM:1700return "PPCISD::XXPERM";1701case PPCISD::VECSHL: return "PPCISD::VECSHL";1702case PPCISD::CMPB: return "PPCISD::CMPB";1703case PPCISD::Hi: return "PPCISD::Hi";1704case PPCISD::Lo: return "PPCISD::Lo";1705case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";1706case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";1707case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";1708case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";1709case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";1710case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";1711case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";1712case PPCISD::SRL: return "PPCISD::SRL";1713case PPCISD::SRA: return "PPCISD::SRA";1714case PPCISD::SHL: return "PPCISD::SHL";1715case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";1716case PPCISD::CALL: return "PPCISD::CALL";1717case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";1718case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";1719case PPCISD::CALL_RM:1720return "PPCISD::CALL_RM";1721case PPCISD::CALL_NOP_RM:1722return "PPCISD::CALL_NOP_RM";1723case PPCISD::CALL_NOTOC_RM:1724return "PPCISD::CALL_NOTOC_RM";1725case PPCISD::MTCTR: return "PPCISD::MTCTR";1726case PPCISD::BCTRL: return "PPCISD::BCTRL";1727case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";1728case PPCISD::BCTRL_RM:1729return "PPCISD::BCTRL_RM";1730case PPCISD::BCTRL_LOAD_TOC_RM:1731return "PPCISD::BCTRL_LOAD_TOC_RM";1732case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE";1733case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";1734case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";1735case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";1736case PPCISD::MFOCRF: return "PPCISD::MFOCRF";1737case PPCISD::MFVSR: return "PPCISD::MFVSR";1738case PPCISD::MTVSRA: return "PPCISD::MTVSRA";1739case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";1740case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";1741case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";1742case PPCISD::SCALAR_TO_VECTOR_PERMUTED:1743return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";1744case PPCISD::ANDI_rec_1_EQ_BIT:1745return 
"PPCISD::ANDI_rec_1_EQ_BIT";1746case PPCISD::ANDI_rec_1_GT_BIT:1747return "PPCISD::ANDI_rec_1_GT_BIT";1748case PPCISD::VCMP: return "PPCISD::VCMP";1749case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec";1750case PPCISD::LBRX: return "PPCISD::LBRX";1751case PPCISD::STBRX: return "PPCISD::STBRX";1752case PPCISD::LFIWAX: return "PPCISD::LFIWAX";1753case PPCISD::LFIWZX: return "PPCISD::LFIWZX";1754case PPCISD::LXSIZX: return "PPCISD::LXSIZX";1755case PPCISD::STXSIX: return "PPCISD::STXSIX";1756case PPCISD::VEXTS: return "PPCISD::VEXTS";1757case PPCISD::LXVD2X: return "PPCISD::LXVD2X";1758case PPCISD::STXVD2X: return "PPCISD::STXVD2X";1759case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";1760case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";1761case PPCISD::ST_VSR_SCAL_INT:1762return "PPCISD::ST_VSR_SCAL_INT";1763case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";1764case PPCISD::BDNZ: return "PPCISD::BDNZ";1765case PPCISD::BDZ: return "PPCISD::BDZ";1766case PPCISD::MFFS: return "PPCISD::MFFS";1767case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";1768case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";1769case PPCISD::CR6SET: return "PPCISD::CR6SET";1770case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";1771case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";1772case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";1773case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";1774case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";1775case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";1776case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";1777case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";1778case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";1779case PPCISD::GET_TLS_MOD_AIX: return "PPCISD::GET_TLS_MOD_AIX";1780case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER";1781case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";1782case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX";1783case PPCISD::TLSLD_AIX: return "PPCISD::TLSLD_AIX";1784case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";1785case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";1786case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";1787case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";1788case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";1789case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";1790case PPCISD::PADDI_DTPREL:1791return "PPCISD::PADDI_DTPREL";1792case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";1793case PPCISD::SC: return "PPCISD::SC";1794case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";1795case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";1796case PPCISD::RFEBB: return "PPCISD::RFEBB";1797case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";1798case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";1799case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";1800case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";1801case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";1802case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";1803case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";1804case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";1805case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";1806case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR:1807return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR";1808case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR:1809return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR";1810case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";1811case PPCISD::PAIR_BUILD: return 
"PPCISD::PAIR_BUILD";1812case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";1813case PPCISD::XXMFACC: return "PPCISD::XXMFACC";1814case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";1815case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";1816case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";1817case PPCISD::FNMSUB: return "PPCISD::FNMSUB";1818case PPCISD::STRICT_FADDRTZ:1819return "PPCISD::STRICT_FADDRTZ";1820case PPCISD::STRICT_FCTIDZ:1821return "PPCISD::STRICT_FCTIDZ";1822case PPCISD::STRICT_FCTIWZ:1823return "PPCISD::STRICT_FCTIWZ";1824case PPCISD::STRICT_FCTIDUZ:1825return "PPCISD::STRICT_FCTIDUZ";1826case PPCISD::STRICT_FCTIWUZ:1827return "PPCISD::STRICT_FCTIWUZ";1828case PPCISD::STRICT_FCFID:1829return "PPCISD::STRICT_FCFID";1830case PPCISD::STRICT_FCFIDU:1831return "PPCISD::STRICT_FCFIDU";1832case PPCISD::STRICT_FCFIDS:1833return "PPCISD::STRICT_FCFIDS";1834case PPCISD::STRICT_FCFIDUS:1835return "PPCISD::STRICT_FCFIDUS";1836case PPCISD::LXVRZX: return "PPCISD::LXVRZX";1837case PPCISD::STORE_COND:1838return "PPCISD::STORE_COND";1839}1840return nullptr;1841}18421843EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,1844EVT VT) const {1845if (!VT.isVector())1846return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;18471848return VT.changeVectorElementTypeToInteger();1849}18501851bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {1852assert(VT.isFloatingPoint() && "Non-floating-point FMA?");1853return true;1854}18551856//===----------------------------------------------------------------------===//1857// Node matching predicates, for use by the tblgen matching code.1858//===----------------------------------------------------------------------===//18591860/// isFloatingPointZero - Return true if this is 0.0 or -0.0.1861static bool isFloatingPointZero(SDValue Op) {1862if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))1863return CFP->getValueAPF().isZero();1864else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {1865// Maybe this has already been legalized into the constant pool?1866if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))1867if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))1868return CFP->getValueAPF().isZero();1869}1870return false;1871}18721873/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return1874/// true if Op is undef or if it matches the specified value.1875static bool isConstantOrUndef(int Op, int Val) {1876return Op < 0 || Op == Val;1877}18781879/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a1880/// VPKUHUM instruction.1881/// The ShuffleKind distinguishes between big-endian operations with1882/// two different inputs (0), either-endian operations with two identical1883/// inputs (1), and little-endian operations with two different inputs (2).1884/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).1885bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,1886SelectionDAG &DAG) {1887bool IsLE = DAG.getDataLayout().isLittleEndian();1888if (ShuffleKind == 0) {1889if (IsLE)1890return false;1891for (unsigned i = 0; i != 16; ++i)1892if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))1893return false;1894} else if (ShuffleKind == 2) {1895if (!IsLE)1896return false;1897for (unsigned i = 0; i != 16; ++i)1898if (!isConstantOrUndef(N->getMaskElt(i), i*2))1899return false;1900} else if (ShuffleKind == 1) {1901unsigned j = IsLE ? 
0 : 1;1902for (unsigned i = 0; i != 8; ++i)1903if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||1904!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))1905return false;1906}1907return true;1908}19091910/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a1911/// VPKUWUM instruction.1912/// The ShuffleKind distinguishes between big-endian operations with1913/// two different inputs (0), either-endian operations with two identical1914/// inputs (1), and little-endian operations with two different inputs (2).1915/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).1916bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,1917SelectionDAG &DAG) {1918bool IsLE = DAG.getDataLayout().isLittleEndian();1919if (ShuffleKind == 0) {1920if (IsLE)1921return false;1922for (unsigned i = 0; i != 16; i += 2)1923if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||1924!isConstantOrUndef(N->getMaskElt(i+1), i*2+3))1925return false;1926} else if (ShuffleKind == 2) {1927if (!IsLE)1928return false;1929for (unsigned i = 0; i != 16; i += 2)1930if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||1931!isConstantOrUndef(N->getMaskElt(i+1), i*2+1))1932return false;1933} else if (ShuffleKind == 1) {1934unsigned j = IsLE ? 0 : 2;1935for (unsigned i = 0; i != 8; i += 2)1936if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||1937!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||1938!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||1939!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))1940return false;1941}1942return true;1943}19441945/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a1946/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the1947/// current subtarget.1948///1949/// The ShuffleKind distinguishes between big-endian operations with1950/// two different inputs (0), either-endian operations with two identical1951/// inputs (1), and little-endian operations with two different inputs (2).1952/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).1953bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,1954SelectionDAG &DAG) {1955const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();1956if (!Subtarget.hasP8Vector())1957return false;19581959bool IsLE = DAG.getDataLayout().isLittleEndian();1960if (ShuffleKind == 0) {1961if (IsLE)1962return false;1963for (unsigned i = 0; i != 16; i += 4)1964if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||1965!isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||1966!isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||1967!isConstantOrUndef(N->getMaskElt(i+3), i*2+7))1968return false;1969} else if (ShuffleKind == 2) {1970if (!IsLE)1971return false;1972for (unsigned i = 0; i != 16; i += 4)1973if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||1974!isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||1975!isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||1976!isConstantOrUndef(N->getMaskElt(i+3), i*2+3))1977return false;1978} else if (ShuffleKind == 1) {1979unsigned j = IsLE ? 
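    // The vpku*um packs keep the low-order half of every element; for vpkudum
    // that low-order word is at byte offset 0 of each doubleword in LE element
    // numbering and at byte offset 4 in BE numbering.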
0 : 4;1980for (unsigned i = 0; i != 8; i += 4)1981if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||1982!isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||1983!isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||1984!isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||1985!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||1986!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||1987!isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||1988!isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))1989return false;1990}1991return true;1992}19931994/// isVMerge - Common function, used to match vmrg* shuffles.1995///1996static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,1997unsigned LHSStart, unsigned RHSStart) {1998if (N->getValueType(0) != MVT::v16i8)1999return false;2000assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&2001"Unsupported merge size!");20022003for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units2004for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit2005if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),2006LHSStart+j+i*UnitSize) ||2007!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),2008RHSStart+j+i*UnitSize))2009return false;2010}2011return true;2012}20132014/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for2015/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).2016/// The ShuffleKind distinguishes between big-endian merges with two2017/// different inputs (0), either-endian merges with two identical inputs (1),2018/// and little-endian merges with two different inputs (2). For the latter,2019/// the input operands are swapped (see PPCInstrAltivec.td).2020bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,2021unsigned ShuffleKind, SelectionDAG &DAG) {2022if (DAG.getDataLayout().isLittleEndian()) {2023if (ShuffleKind == 1) // unary2024return isVMerge(N, UnitSize, 0, 0);2025else if (ShuffleKind == 2) // swapped2026return isVMerge(N, UnitSize, 0, 16);2027else2028return false;2029} else {2030if (ShuffleKind == 1) // unary2031return isVMerge(N, UnitSize, 8, 8);2032else if (ShuffleKind == 0) // normal2033return isVMerge(N, UnitSize, 8, 24);2034else2035return false;2036}2037}20382039/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for2040/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).2041/// The ShuffleKind distinguishes between big-endian merges with two2042/// different inputs (0), either-endian merges with two identical inputs (1),2043/// and little-endian merges with two different inputs (2). For the latter,2044/// the input operands are swapped (see PPCInstrAltivec.td).2045bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,2046unsigned ShuffleKind, SelectionDAG &DAG) {2047if (DAG.getDataLayout().isLittleEndian()) {2048if (ShuffleKind == 1) // unary2049return isVMerge(N, UnitSize, 8, 8);2050else if (ShuffleKind == 2) // swapped2051return isVMerge(N, UnitSize, 8, 24);2052else2053return false;2054} else {2055if (ShuffleKind == 1) // unary2056return isVMerge(N, UnitSize, 0, 0);2057else if (ShuffleKind == 0) // normal2058return isVMerge(N, UnitSize, 0, 16);2059else2060return false;2061}2062}20632064/**2065* Common function used to match vmrgew and vmrgow shuffles2066*2067* The indexOffset determines whether to look for even or odd words in2068* the shuffle mask. 
This is based on the endianness of the target machine.
 *   - Little Endian:
 *      - Use offset of 0 to check for odd elements
 *      - Use offset of 4 to check for even elements
 *   - Big Endian:
 *      - Use offset of 0 to check for even elements
 *      - Use offset of 4 to check for odd elements
 * A detailed description of the vector element ordering for little endian and
 * big endian can be found at
 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
 * Targeting your applications - what little endian and big endian IBM XL C/C++
 * compiler differences mean to you
 *
 * The mask to the shuffle vector instruction specifies the indices of the
 * elements from the two input vectors to place in the result. The elements are
 * numbered in array-access order, starting with the first vector. These
 * vectors are always of type v16i8, thus each vector will contain 16 elements
 * of size 8. More info on the shuffle vector can be found in the
 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
 * Language Reference.
 *
 * The RHSStartValue indicates whether the same input vectors are used (unary)
 * or two different input vectors are used, based on the following:
 *   - If the instruction uses the same vector for both inputs, the range of
 *     the indices will be 0 to 15. In this case, the RHSStart value passed
 *     should be 0.
 *   - If the instruction has two different vectors then the range of the
 *     indices will be 0 to 31. In this case, the RHSStart value passed should
 *     be 16 (indices 0-15 specify elements in the first vector while indices
 *     16 to 31 specify elements in the second vector).
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] IndexOffset Specifies whether to look for even or odd elements
 * \param[in] RHSStartValue Specifies the starting index for the righthand
 *            input vector to the shuffle_vector instruction
 * \return true iff this shuffle vector represents an even or odd word merge
 */
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
                     unsigned RHSStartValue) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;

  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      if (!isConstantOrUndef(N->getMaskElt(i * 4 + j),
                             i * RHSStartValue + j + IndexOffset) ||
          !isConstantOrUndef(N->getMaskElt(i * 4 + j + 8),
                             i * RHSStartValue + j + IndexOffset + 8))
        return false;
  return true;
}

/**
 * Determine if the specified shuffle mask is suitable for the vmrgew or
 * vmrgow instructions.
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
 * \param[in] ShuffleKind Identify the type of merge:
 *   - 0 = big-endian merge with two different inputs;
 *   - 1 = either-endian merge with two identical inputs;
 *   - 2 = little-endian merge with two different inputs (inputs are swapped
 *     for little-endian merges).
 * \param[in] DAG The current SelectionDAG
 * \return true iff this shuffle mask is suitable for a vmrgew or vmrgow merge
 */
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    unsigned indexOffset = CheckEven ?
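    // As the table above notes, on little-endian targets even elements are
    // found at byte offset 4 and odd elements at byte offset 0.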
4 : 0;2139if (ShuffleKind == 1) // Unary2140return isVMerge(N, indexOffset, 0);2141else if (ShuffleKind == 2) // swapped2142return isVMerge(N, indexOffset, 16);2143else2144return false;2145}2146else {2147unsigned indexOffset = CheckEven ? 0 : 4;2148if (ShuffleKind == 1) // Unary2149return isVMerge(N, indexOffset, 0);2150else if (ShuffleKind == 0) // Normal2151return isVMerge(N, indexOffset, 16);2152else2153return false;2154}2155return false;2156}21572158/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift2159/// amount, otherwise return -1.2160/// The ShuffleKind distinguishes between big-endian operations with two2161/// different inputs (0), either-endian operations with two identical inputs2162/// (1), and little-endian operations with two different inputs (2). For the2163/// latter, the input operands are swapped (see PPCInstrAltivec.td).2164int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,2165SelectionDAG &DAG) {2166if (N->getValueType(0) != MVT::v16i8)2167return -1;21682169ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);21702171// Find the first non-undef value in the shuffle mask.2172unsigned i;2173for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)2174/*search*/;21752176if (i == 16) return -1; // all undef.21772178// Otherwise, check to see if the rest of the elements are consecutively2179// numbered from this value.2180unsigned ShiftAmt = SVOp->getMaskElt(i);2181if (ShiftAmt < i) return -1;21822183ShiftAmt -= i;2184bool isLE = DAG.getDataLayout().isLittleEndian();21852186if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {2187// Check the rest of the elements to see if they are consecutive.2188for (++i; i != 16; ++i)2189if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))2190return -1;2191} else if (ShuffleKind == 1) {2192// Check the rest of the elements to see if they are consecutive.2193for (++i; i != 16; ++i)2194if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))2195return -1;2196} else2197return -1;21982199if (isLE)2200ShiftAmt = 16 - ShiftAmt;22012202return ShiftAmt;2203}22042205/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand2206/// specifies a splat of a single element that is suitable for input to2207/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).2208bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {2209EVT VT = N->getValueType(0);2210if (VT == MVT::v2i64 || VT == MVT::v2f64)2211return EltSize == 8 && N->getMaskElt(0) == N->getMaskElt(1);22122213assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) &&2214EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");22152216// The consecutive indices need to specify an element, not part of two2217// different elements. 
So abandon ship early if this isn't the case.2218if (N->getMaskElt(0) % EltSize != 0)2219return false;22202221// This is a splat operation if each element of the permute is the same, and2222// if the value doesn't reference the second vector.2223unsigned ElementBase = N->getMaskElt(0);22242225// FIXME: Handle UNDEF elements too!2226if (ElementBase >= 16)2227return false;22282229// Check that the indices are consecutive, in the case of a multi-byte element2230// splatted with a v16i8 mask.2231for (unsigned i = 1; i != EltSize; ++i)2232if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))2233return false;22342235for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {2236if (N->getMaskElt(i) < 0) continue;2237for (unsigned j = 0; j != EltSize; ++j)2238if (N->getMaskElt(i+j) != N->getMaskElt(j))2239return false;2240}2241return true;2242}22432244/// Check that the mask is shuffling N byte elements. Within each N byte2245/// element of the mask, the indices could be either in increasing or2246/// decreasing order as long as they are consecutive.2247/// \param[in] N the shuffle vector SD Node to analyze2248/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/2249/// Word/DoubleWord/QuadWord).2250/// \param[in] StepLen the delta indices number among the N byte element, if2251/// the mask is in increasing/decreasing order then it is 1/-1.2252/// \return true iff the mask is shuffling N byte elements.2253static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,2254int StepLen) {2255assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&2256"Unexpected element width.");2257assert((StepLen == 1 || StepLen == -1) && "Unexpected element width.");22582259unsigned NumOfElem = 16 / Width;2260unsigned MaskVal[16]; // Width is never greater than 162261for (unsigned i = 0; i < NumOfElem; ++i) {2262MaskVal[0] = N->getMaskElt(i * Width);2263if ((StepLen == 1) && (MaskVal[0] % Width)) {2264return false;2265} else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {2266return false;2267}22682269for (unsigned int j = 1; j < Width; ++j) {2270MaskVal[j] = N->getMaskElt(i * Width + j);2271if (MaskVal[j] != MaskVal[j-1] + StepLen) {2272return false;2273}2274}2275}22762277return true;2278}22792280bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,2281unsigned &InsertAtByte, bool &Swap, bool IsLE) {2282if (!isNByteElemShuffleMask(N, 4, 1))2283return false;22842285// Now we look at mask elements 0,4,8,122286unsigned M0 = N->getMaskElt(0) / 4;2287unsigned M1 = N->getMaskElt(4) / 4;2288unsigned M2 = N->getMaskElt(8) / 4;2289unsigned M3 = N->getMaskElt(12) / 4;2290unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };2291unsigned BigEndianShifts[] = { 3, 0, 1, 2 };22922293// Below, let H and L be arbitrary elements of the shuffle mask2294// where H is in the range [4,7] and L is in the range [0,3].2295// H, 1, 2, 3 or L, 5, 6, 72296if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||2297(M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {2298ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];2299InsertAtByte = IsLE ? 12 : 0;2300Swap = M0 < 4;2301return true;2302}2303// 0, H, 2, 3 or 4, L, 6, 72304if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||2305(M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {2306ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];2307InsertAtByte = IsLE ? 
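    // Across the four word positions the insert byte is 4*i on BE and
    // 12 - 4*i on LE (word order reversed); this is the i == 1 case.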
8 : 4;2308Swap = M1 < 4;2309return true;2310}2311// 0, 1, H, 3 or 4, 5, L, 72312if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||2313(M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {2314ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];2315InsertAtByte = IsLE ? 4 : 8;2316Swap = M2 < 4;2317return true;2318}2319// 0, 1, 2, H or 4, 5, 6, L2320if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||2321(M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {2322ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];2323InsertAtByte = IsLE ? 0 : 12;2324Swap = M3 < 4;2325return true;2326}23272328// If both vector operands for the shuffle are the same vector, the mask will2329// contain only elements from the first one and the second one will be undef.2330if (N->getOperand(1).isUndef()) {2331ShiftElts = 0;2332Swap = true;2333unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;2334if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {2335InsertAtByte = IsLE ? 12 : 0;2336return true;2337}2338if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {2339InsertAtByte = IsLE ? 8 : 4;2340return true;2341}2342if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {2343InsertAtByte = IsLE ? 4 : 8;2344return true;2345}2346if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {2347InsertAtByte = IsLE ? 0 : 12;2348return true;2349}2350}23512352return false;2353}23542355bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,2356bool &Swap, bool IsLE) {2357assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");2358// Ensure each byte index of the word is consecutive.2359if (!isNByteElemShuffleMask(N, 4, 1))2360return false;23612362// Now we look at mask elements 0,4,8,12, which are the beginning of words.2363unsigned M0 = N->getMaskElt(0) / 4;2364unsigned M1 = N->getMaskElt(4) / 4;2365unsigned M2 = N->getMaskElt(8) / 4;2366unsigned M3 = N->getMaskElt(12) / 4;23672368// If both vector operands for the shuffle are the same vector, the mask will2369// contain only elements from the first one and the second one will be undef.2370if (N->getOperand(1).isUndef()) {2371assert(M0 < 4 && "Indexing into an undef vector?");2372if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)2373return false;23742375ShiftElts = IsLE ? 
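    // The shift count is taken in left-to-right (BE) word order, so on LE the
    // leading mask element is reflected: rotate by (4 - M0) % 4 words.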
(4 - M0) % 4 : M0;2376Swap = false;2377return true;2378}23792380// Ensure each word index of the ShuffleVector Mask is consecutive.2381if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)2382return false;23832384if (IsLE) {2385if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {2386// Input vectors don't need to be swapped if the leading element2387// of the result is one of the 3 left elements of the second vector2388// (or if there is no shift to be done at all).2389Swap = false;2390ShiftElts = (8 - M0) % 8;2391} else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {2392// Input vectors need to be swapped if the leading element2393// of the result is one of the 3 left elements of the first vector2394// (or if we're shifting by 4 - thereby simply swapping the vectors).2395Swap = true;2396ShiftElts = (4 - M0) % 4;2397}23982399return true;2400} else { // BE2401if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {2402// Input vectors don't need to be swapped if the leading element2403// of the result is one of the 4 elements of the first vector.2404Swap = false;2405ShiftElts = M0;2406} else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {2407// Input vectors need to be swapped if the leading element2408// of the result is one of the 4 elements of the right vector.2409Swap = true;2410ShiftElts = M0 - 4;2411}24122413return true;2414}2415}24162417bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {2418assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");24192420if (!isNByteElemShuffleMask(N, Width, -1))2421return false;24222423for (int i = 0; i < 16; i += Width)2424if (N->getMaskElt(i) != i + Width - 1)2425return false;24262427return true;2428}24292430bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {2431return isXXBRShuffleMaskHelper(N, 2);2432}24332434bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {2435return isXXBRShuffleMaskHelper(N, 4);2436}24372438bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {2439return isXXBRShuffleMaskHelper(N, 8);2440}24412442bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {2443return isXXBRShuffleMaskHelper(N, 16);2444}24452446/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap2447/// if the inputs to the instruction should be swapped and set \p DM to the2448/// value for the immediate.2449/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI2450/// AND element 0 of the result comes from the first input (LE) or second input2451/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.2452/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle2453/// mask.2454bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,2455bool &Swap, bool IsLE) {2456assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");24572458// Ensure each byte index of the double word is consecutive.2459if (!isNByteElemShuffleMask(N, 8, 1))2460return false;24612462unsigned M0 = N->getMaskElt(0) / 8;2463unsigned M1 = N->getMaskElt(8) / 8;2464assert(((M0 | M1) < 4) && "A mask element out of bounds?");24652466// If both vector operands for the shuffle are the same vector, the mask will2467// contain only elements from the first one and the second one will be undef.2468if (N->getOperand(1).isUndef()) {2469if ((M0 | M1) < 2) {2470DM = IsLE ? 
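      // The DM field selects doublewords in left-to-right (BE) order, so for
      // LE the indices are complemented and their positions swapped.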
(((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);2471Swap = false;2472return true;2473} else2474return false;2475}24762477if (IsLE) {2478if (M0 > 1 && M1 < 2) {2479Swap = false;2480} else if (M0 < 2 && M1 > 1) {2481M0 = (M0 + 2) % 4;2482M1 = (M1 + 2) % 4;2483Swap = true;2484} else2485return false;24862487// Note: if control flow comes here that means Swap is already set above2488DM = (((~M1) & 1) << 1) + ((~M0) & 1);2489return true;2490} else { // BE2491if (M0 < 2 && M1 > 1) {2492Swap = false;2493} else if (M0 > 1 && M1 < 2) {2494M0 = (M0 + 2) % 4;2495M1 = (M1 + 2) % 4;2496Swap = true;2497} else2498return false;24992500// Note: if control flow comes here that means Swap is already set above2501DM = (M0 << 1) + (M1 & 1);2502return true;2503}2504}250525062507/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is2508/// appropriate for PPC mnemonics (which have a big endian bias - namely2509/// elements are counted from the left of the vector register).2510unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,2511SelectionDAG &DAG) {2512ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);2513assert(isSplatShuffleMask(SVOp, EltSize));2514EVT VT = SVOp->getValueType(0);25152516if (VT == MVT::v2i64 || VT == MVT::v2f64)2517return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(0)2518: SVOp->getMaskElt(0);25192520if (DAG.getDataLayout().isLittleEndian())2521return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);2522else2523return SVOp->getMaskElt(0) / EltSize;2524}25252526/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed2527/// by using a vspltis[bhw] instruction of the specified element size, return2528/// the constant being splatted. The ByteSize field indicates the number of2529/// bytes of each element [124] -> [bhw].2530SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {2531SDValue OpVal;25322533// If ByteSize of the splat is bigger than the element size of the2534// build_vector, then we have a case where we are checking for a splat where2535// multiple elements of the buildvector are folded together into a single2536// logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).2537unsigned EltSize = 16/N->getNumOperands();2538if (EltSize < ByteSize) {2539unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.2540SDValue UniquedVals[4];2541assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");25422543// See if all of the elements in the buildvector agree across.2544for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {2545if (N->getOperand(i).isUndef()) continue;2546// If the element isn't a constant, bail fully out.2547if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();25482549if (!UniquedVals[i&(Multiple-1)].getNode())2550UniquedVals[i&(Multiple-1)] = N->getOperand(i);2551else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))2552return SDValue(); // no match.2553}25542555// Okay, if we reached this point, UniquedVals[0..Multiple-1] contains2556// either constant or undef values that are identical for each chunk. See2557// if these chunks can form into a larger vspltis*.25582559// Check to see if all of the leading entries are either 0 or -1. 
If2560// neither, then this won't fit into the immediate field.2561bool LeadingZero = true;2562bool LeadingOnes = true;2563for (unsigned i = 0; i != Multiple-1; ++i) {2564if (!UniquedVals[i].getNode()) continue; // Must have been undefs.25652566LeadingZero &= isNullConstant(UniquedVals[i]);2567LeadingOnes &= isAllOnesConstant(UniquedVals[i]);2568}2569// Finally, check the least significant entry.2570if (LeadingZero) {2571if (!UniquedVals[Multiple-1].getNode())2572return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef2573int Val = UniquedVals[Multiple - 1]->getAsZExtVal();2574if (Val < 16) // 0,0,0,4 -> vspltisw(4)2575return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);2576}2577if (LeadingOnes) {2578if (!UniquedVals[Multiple-1].getNode())2579return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef2580int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();2581if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)2582return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);2583}25842585return SDValue();2586}25872588// Check to see if this buildvec has a single non-undef value in its elements.2589for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {2590if (N->getOperand(i).isUndef()) continue;2591if (!OpVal.getNode())2592OpVal = N->getOperand(i);2593else if (OpVal != N->getOperand(i))2594return SDValue();2595}25962597if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.25982599unsigned ValSizeInBytes = EltSize;2600uint64_t Value = 0;2601if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {2602Value = CN->getZExtValue();2603} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {2604assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");2605Value = llvm::bit_cast<uint32_t>(CN->getValueAPF().convertToFloat());2606}26072608// If the splat value is larger than the element value, then we can never do2609// this splat. The only case that we could fit the replicated bits into our2610// immediate field for would be zero, and we prefer to use vxor for it.2611if (ValSizeInBytes < ByteSize) return SDValue();26122613// If the element value is larger than the splat value, check if it consists2614// of a repeated bit pattern of size ByteSize.2615if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))2616return SDValue();26172618// Properly sign extend the value.2619int MaskVal = SignExtend32(Value, ByteSize * 8);26202621// If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.2622if (MaskVal == 0) return SDValue();26232624// Finally, if this value fits in a 5 bit sext field, return it2625if (SignExtend32<5>(MaskVal) == MaskVal)2626return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);2627return SDValue();2628}26292630//===----------------------------------------------------------------------===//2631// Addressing Mode Selection2632//===----------------------------------------------------------------------===//26332634/// isIntS16Immediate - This method tests to see if the node is either a 32-bit2635/// or 64-bit immediate, and if the value can be accurately represented as a2636/// sign extension from a 16-bit value. 
If so, this returns true and the2637/// immediate.2638bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {2639if (!isa<ConstantSDNode>(N))2640return false;26412642Imm = (int16_t)N->getAsZExtVal();2643if (N->getValueType(0) == MVT::i32)2644return Imm == (int32_t)N->getAsZExtVal();2645else2646return Imm == (int64_t)N->getAsZExtVal();2647}2648bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {2649return isIntS16Immediate(Op.getNode(), Imm);2650}26512652/// Used when computing address flags for selecting loads and stores.2653/// If we have an OR, check if the LHS and RHS are provably disjoint.2654/// An OR of two provably disjoint values is equivalent to an ADD.2655/// Most PPC load/store instructions compute the effective address as a sum,2656/// so doing this conversion is useful.2657static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {2658if (N.getOpcode() != ISD::OR)2659return false;2660KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));2661if (!LHSKnown.Zero.getBoolValue())2662return false;2663KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));2664return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);2665}26662667/// SelectAddressEVXRegReg - Given the specified address, check to see if it can2668/// be represented as an indexed [r+r] operation.2669bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,2670SDValue &Index,2671SelectionDAG &DAG) const {2672for (SDNode *U : N->uses()) {2673if (MemSDNode *Memop = dyn_cast<MemSDNode>(U)) {2674if (Memop->getMemoryVT() == MVT::f64) {2675Base = N.getOperand(0);2676Index = N.getOperand(1);2677return true;2678}2679}2680}2681return false;2682}26832684/// isIntS34Immediate - This method tests if value of node given can be2685/// accurately represented as a sign extension from a 34-bit value. If so,2686/// this returns true and the immediate.2687bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {2688if (!isa<ConstantSDNode>(N))2689return false;26902691Imm = (int64_t)N->getAsZExtVal();2692return isInt<34>(Imm);2693}2694bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {2695return isIntS34Immediate(Op.getNode(), Imm);2696}26972698/// SelectAddressRegReg - Given the specified addressed, check to see if it2699/// can be represented as an indexed [r+r] operation. Returns false if it2700/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is2701/// non-zero and N can be represented by a base register plus a signed 16-bit2702/// displacement, make a more precise judgement by checking (displacement % \p2703/// EncodingAlignment).2704bool PPCTargetLowering::SelectAddressRegReg(2705SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,2706MaybeAlign EncodingAlignment) const {2707// If we have a PC Relative target flag don't select as [reg+reg]. 
It will be2708// a [pc+imm].2709if (SelectAddressPCRel(N, Base))2710return false;27112712int16_t Imm = 0;2713if (N.getOpcode() == ISD::ADD) {2714// Is there any SPE load/store (f64), which can't handle 16bit offset?2715// SPE load/store can only handle 8-bit offsets.2716if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))2717return true;2718if (isIntS16Immediate(N.getOperand(1), Imm) &&2719(!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))2720return false; // r+i2721if (N.getOperand(1).getOpcode() == PPCISD::Lo)2722return false; // r+i27232724Base = N.getOperand(0);2725Index = N.getOperand(1);2726return true;2727} else if (N.getOpcode() == ISD::OR) {2728if (isIntS16Immediate(N.getOperand(1), Imm) &&2729(!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))2730return false; // r+i can fold it if we can.27312732// If this is an or of disjoint bitfields, we can codegen this as an add2733// (for better address arithmetic) if the LHS and RHS of the OR are provably2734// disjoint.2735KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));27362737if (LHSKnown.Zero.getBoolValue()) {2738KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));2739// If all of the bits are known zero on the LHS or RHS, the add won't2740// carry.2741if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {2742Base = N.getOperand(0);2743Index = N.getOperand(1);2744return true;2745}2746}2747}27482749return false;2750}27512752// If we happen to be doing an i64 load or store into a stack slot that has2753// less than a 4-byte alignment, then the frame-index elimination may need to2754// use an indexed load or store instruction (because the offset may not be a2755// multiple of 4). The extra register needed to hold the offset comes from the2756// register scavenger, and it is possible that the scavenger will need to use2757// an emergency spill slot. As a result, we need to make sure that a spill slot2758// is allocated when doing an i64 load/store into a less-than-4-byte-aligned2759// stack slot.2760static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {2761// FIXME: This does not handle the LWA case.2762if (VT != MVT::i64)2763return;27642765// NOTE: We'll exclude negative FIs here, which come from argument2766// lowering, because there are no known test cases triggering this problem2767// using packed structures (or similar). We can remove this exclusion if2768// we find such a test case. The reason why this is so test-case driven is2769// because this entire 'fixup' is only to prevent crashes (from the2770// register scavenger) on not-really-valid inputs. For example, if we have:2771// %a = alloca i12772// %b = bitcast i1* %a to i64*2773// store i64* a, i64 b2774// then the store should really be marked as 'align 1', but is not. If it2775// were marked as 'align 1' then the indexed form would have been2776// instruction-selected initially, and the problem this 'fixup' is preventing2777// won't happen regardless.2778if (FrameIdx < 0)2779return;27802781MachineFunction &MF = DAG.getMachineFunction();2782MachineFrameInfo &MFI = MF.getFrameInfo();27832784if (MFI.getObjectAlign(FrameIdx) >= Align(4))2785return;27862787PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();2788FuncInfo->setHasNonRISpills();2789}27902791/// Returns true if the address N can be represented by a base register plus2792/// a signed 16-bit displacement [r+imm], and if it is not better2793/// represented as reg+reg. 
If \p EncodingAlignment is non-zero, only accept2794/// displacements that are multiples of that value.2795bool PPCTargetLowering::SelectAddressRegImm(2796SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,2797MaybeAlign EncodingAlignment) const {2798// FIXME dl should come from parent load or store, not from address2799SDLoc dl(N);28002801// If we have a PC Relative target flag don't select as [reg+imm]. It will be2802// a [pc+imm].2803if (SelectAddressPCRel(N, Base))2804return false;28052806// If this can be more profitably realized as r+r, fail.2807if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))2808return false;28092810if (N.getOpcode() == ISD::ADD) {2811int16_t imm = 0;2812if (isIntS16Immediate(N.getOperand(1), imm) &&2813(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {2814Disp = DAG.getTargetConstant(imm, dl, N.getValueType());2815if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {2816Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());2817fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());2818} else {2819Base = N.getOperand(0);2820}2821return true; // [r+i]2822} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {2823// Match LOAD (ADD (X, Lo(G))).2824assert(!N.getOperand(1).getConstantOperandVal(1) &&2825"Cannot handle constant offsets yet!");2826Disp = N.getOperand(1).getOperand(0); // The global address.2827assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||2828Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||2829Disp.getOpcode() == ISD::TargetConstantPool ||2830Disp.getOpcode() == ISD::TargetJumpTable);2831Base = N.getOperand(0);2832return true; // [&g+r]2833}2834} else if (N.getOpcode() == ISD::OR) {2835int16_t imm = 0;2836if (isIntS16Immediate(N.getOperand(1), imm) &&2837(!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {2838// If this is an or of disjoint bitfields, we can codegen this as an add2839// (for better address arithmetic) if the LHS and RHS of the OR are2840// provably disjoint.2841KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));28422843if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {2844// If all of the bits are known zero on the LHS or RHS, the add won't2845// carry.2846if (FrameIndexSDNode *FI =2847dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {2848Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());2849fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());2850} else {2851Base = N.getOperand(0);2852}2853Disp = DAG.getTargetConstant(imm, dl, N.getValueType());2854return true;2855}2856}2857} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {2858// Loading from a constant address.28592860// If this address fits entirely in a 16-bit sext immediate field, codegen2861// this as "d, 0"2862int16_t Imm;2863if (isIntS16Immediate(CN, Imm) &&2864(!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {2865Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));2866Base = DAG.getRegister(Subtarget.isPPC64() ? 
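      // An RA field of 0 in a D-form access means "use the value 0", not the
      // contents of r0, so ZERO/ZERO8 give a displacement off absolute zero.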
PPC::ZERO8 : PPC::ZERO,2867CN->getValueType(0));2868return true;2869}28702871// Handle 32-bit sext immediates with LIS + addr mode.2872if ((CN->getValueType(0) == MVT::i32 ||2873(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&2874(!EncodingAlignment ||2875isAligned(*EncodingAlignment, CN->getZExtValue()))) {2876int Addr = (int)CN->getZExtValue();28772878// Otherwise, break this down into an LIS + disp.2879Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);28802881Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,2882MVT::i32);2883unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;2884Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);2885return true;2886}2887}28882889Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));2890if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {2891Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());2892fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());2893} else2894Base = N;2895return true; // [r+0]2896}28972898/// Similar to the 16-bit case but for instructions that take a 34-bit2899/// displacement field (prefixed loads/stores).2900bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,2901SDValue &Base,2902SelectionDAG &DAG) const {2903// Only on 64-bit targets.2904if (N.getValueType() != MVT::i64)2905return false;29062907SDLoc dl(N);2908int64_t Imm = 0;29092910if (N.getOpcode() == ISD::ADD) {2911if (!isIntS34Immediate(N.getOperand(1), Imm))2912return false;2913Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());2914if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))2915Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());2916else2917Base = N.getOperand(0);2918return true;2919}29202921if (N.getOpcode() == ISD::OR) {2922if (!isIntS34Immediate(N.getOperand(1), Imm))2923return false;2924// If this is an or of disjoint bitfields, we can codegen this as an add2925// (for better address arithmetic) if the LHS and RHS of the OR are2926// provably disjoint.2927KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));2928if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)2929return false;2930if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))2931Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());2932else2933Base = N.getOperand(0);2934Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());2935return true;2936}29372938if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.2939Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());2940Base = DAG.getRegister(PPC::ZERO8, N.getValueType());2941return true;2942}29432944return false;2945}29462947/// SelectAddressRegRegOnly - Given the specified addressed, force it to be2948/// represented as an indexed [r+r] operation.2949bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,2950SDValue &Index,2951SelectionDAG &DAG) const {2952// Check to see if we can easily represent this as an [r+r] address. This2953// will fail if it thinks that the address is more profitably represented as2954// reg+imm, e.g. where imm = 0.2955if (SelectAddressRegReg(N, Base, Index, DAG))2956return true;29572958// If the address is the result of an add, we will utilize the fact that the2959// address calculation includes an implicit add. However, we can reduce2960// register pressure if we do not materialize a constant just for use as the2961// index register. 
We only get rid of the add if it is not an add of a2962// value and a 16-bit signed constant and both have a single use.2963int16_t imm = 0;2964if (N.getOpcode() == ISD::ADD &&2965(!isIntS16Immediate(N.getOperand(1), imm) ||2966!N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {2967Base = N.getOperand(0);2968Index = N.getOperand(1);2969return true;2970}29712972// Otherwise, do it the hard way, using R0 as the base register.2973Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,2974N.getValueType());2975Index = N;2976return true;2977}29782979template <typename Ty> static bool isValidPCRelNode(SDValue N) {2980Ty *PCRelCand = dyn_cast<Ty>(N);2981return PCRelCand && (PPCInstrInfo::hasPCRelFlag(PCRelCand->getTargetFlags()));2982}29832984/// Returns true if this address is a PC Relative address.2985/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG2986/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.2987bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {2988// This is a materialize PC Relative node. Always select this as PC Relative.2989Base = N;2990if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)2991return true;2992if (isValidPCRelNode<ConstantPoolSDNode>(N) ||2993isValidPCRelNode<GlobalAddressSDNode>(N) ||2994isValidPCRelNode<JumpTableSDNode>(N) ||2995isValidPCRelNode<BlockAddressSDNode>(N))2996return true;2997return false;2998}29993000/// Returns true if we should use a direct load into vector instruction3001/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.3002static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {30033004// If there are any other uses other than scalar to vector, then we should3005// keep it as a scalar load -> direct move pattern to prevent multiple3006// loads.3007LoadSDNode *LD = dyn_cast<LoadSDNode>(N);3008if (!LD)3009return false;30103011EVT MemVT = LD->getMemoryVT();3012if (!MemVT.isSimple())3013return false;3014switch(MemVT.getSimpleVT().SimpleTy) {3015case MVT::i64:3016break;3017case MVT::i32:3018if (!ST.hasP8Vector())3019return false;3020break;3021case MVT::i16:3022case MVT::i8:3023if (!ST.hasP9Vector())3024return false;3025break;3026default:3027return false;3028}30293030SDValue LoadedVal(N, 0);3031if (!LoadedVal.hasOneUse())3032return false;30333034for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();3035UI != UE; ++UI)3036if (UI.getUse().get().getResNo() == 0 &&3037UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&3038UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)3039return false;30403041return true;3042}30433044/// getPreIndexedAddressParts - returns true by value, base pointer and3045/// offset pointer and addressing mode by reference if the node's address3046/// can be legally represented as pre-indexed load / store address.3047bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,3048SDValue &Offset,3049ISD::MemIndexedMode &AM,3050SelectionDAG &DAG) const {3051if (DisablePPCPreinc) return false;30523053bool isLoad = true;3054SDValue Ptr;3055EVT VT;3056Align Alignment;3057if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {3058Ptr = LD->getBasePtr();3059VT = LD->getMemoryVT();3060Alignment = LD->getAlign();3061} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {3062Ptr = ST->getBasePtr();3063VT = ST->getMemoryVT();3064Alignment = ST->getAlign();3065isLoad = false;3066} else3067return false;30683069// Do not generate pre-inc forms for specific loads that feed scalar_to_vector3070// instructions because we can 
fold these into a more efficient instruction3071// instead, (such as LXSD).3072if (isLoad && usePartialVectorLoads(N, Subtarget)) {3073return false;3074}30753076// PowerPC doesn't have preinc load/store instructions for vectors3077if (VT.isVector())3078return false;30793080if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {3081// Common code will reject creating a pre-inc form if the base pointer3082// is a frame index, or if N is a store and the base pointer is either3083// the same as or a predecessor of the value being stored. Check for3084// those situations here, and try with swapped Base/Offset instead.3085bool Swap = false;30863087if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))3088Swap = true;3089else if (!isLoad) {3090SDValue Val = cast<StoreSDNode>(N)->getValue();3091if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))3092Swap = true;3093}30943095if (Swap)3096std::swap(Base, Offset);30973098AM = ISD::PRE_INC;3099return true;3100}31013102// LDU/STU can only handle immediates that are a multiple of 4.3103if (VT != MVT::i64) {3104if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, std::nullopt))3105return false;3106} else {3107// LDU/STU need an address with at least 4-byte alignment.3108if (Alignment < Align(4))3109return false;31103111if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))3112return false;3113}31143115if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {3116// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of3117// sext i32 to i64 when addr mode is r+i.3118if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&3119LD->getExtensionType() == ISD::SEXTLOAD &&3120isa<ConstantSDNode>(Offset))3121return false;3122}31233124AM = ISD::PRE_INC;3125return true;3126}31273128//===----------------------------------------------------------------------===//3129// LowerOperation implementation3130//===----------------------------------------------------------------------===//31313132/// Return true if we should reference labels using a PICBase, set the HiOpFlags3133/// and LoOpFlags to the target MO flags.3134static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,3135unsigned &HiOpFlags, unsigned &LoOpFlags,3136const GlobalValue *GV = nullptr) {3137HiOpFlags = PPCII::MO_HA;3138LoOpFlags = PPCII::MO_LO;31393140// Don't use the pic base if not in PIC relocation model.3141if (IsPIC) {3142HiOpFlags = PPCII::MO_PIC_HA_FLAG;3143LoOpFlags = PPCII::MO_PIC_LO_FLAG;3144}3145}31463147static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,3148SelectionDAG &DAG) {3149SDLoc DL(HiPart);3150EVT PtrVT = HiPart.getValueType();3151SDValue Zero = DAG.getConstant(0, DL, PtrVT);31523153SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);3154SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);31553156// With PIC, the first instruction is actually "GR+hi(&G)".3157if (isPIC)3158Hi = DAG.getNode(ISD::ADD, DL, PtrVT,3159DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);31603161// Generate non-pic code that has direct accesses to the constant pool.3162// The address of the global is just (hi(&g)+lo(&g)).3163return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);3164}31653166static void setUsesTOCBasePtr(MachineFunction &MF) {3167PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();3168FuncInfo->setUsesTOCBasePtr();3169}31703171static void setUsesTOCBasePtr(SelectionDAG &DAG) {3172setUsesTOCBasePtr(DAG.getMachineFunction());3173}31743175SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const 
SDLoc &dl,3176SDValue GA) const {3177const bool Is64Bit = Subtarget.isPPC64();3178EVT VT = Is64Bit ? MVT::i64 : MVT::i32;3179SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)3180: Subtarget.isAIXABI()3181? DAG.getRegister(PPC::R2, VT)3182: DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);3183SDValue Ops[] = { GA, Reg };3184return DAG.getMemIntrinsicNode(3185PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,3186MachinePointerInfo::getGOT(DAG.getMachineFunction()), std::nullopt,3187MachineMemOperand::MOLoad);3188}31893190SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,3191SelectionDAG &DAG) const {3192EVT PtrVT = Op.getValueType();3193ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);3194const Constant *C = CP->getConstVal();31953196// 64-bit SVR4 ABI and AIX ABI code are always position-independent.3197// The actual address of the GlobalValue is stored in the TOC.3198if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {3199if (Subtarget.isUsingPCRelativeCalls()) {3200SDLoc DL(CP);3201EVT Ty = getPointerTy(DAG.getDataLayout());3202SDValue ConstPool = DAG.getTargetConstantPool(3203C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);3204return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);3205}3206setUsesTOCBasePtr(DAG);3207SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);3208return getTOCEntry(DAG, SDLoc(CP), GA);3209}32103211unsigned MOHiFlag, MOLoFlag;3212bool IsPIC = isPositionIndependent();3213getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);32143215if (IsPIC && Subtarget.isSVR4ABI()) {3216SDValue GA =3217DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);3218return getTOCEntry(DAG, SDLoc(CP), GA);3219}32203221SDValue CPIHi =3222DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);3223SDValue CPILo =3224DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);3225return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);3226}32273228// For 64-bit PowerPC, prefer the more compact relative encodings.3229// This trades 32 bits per jump table entry for one or two instructions3230// on the jump site.3231unsigned PPCTargetLowering::getJumpTableEncoding() const {3232if (isJumpTableRelative())3233return MachineJumpTableInfo::EK_LabelDifference32;32343235return TargetLowering::getJumpTableEncoding();3236}32373238bool PPCTargetLowering::isJumpTableRelative() const {3239if (UseAbsoluteJumpTables)3240return false;3241if (Subtarget.isPPC64() || Subtarget.isAIXABI())3242return true;3243return TargetLowering::isJumpTableRelative();3244}32453246SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,3247SelectionDAG &DAG) const {3248if (!Subtarget.isPPC64() || Subtarget.isAIXABI())3249return TargetLowering::getPICJumpTableRelocBase(Table, DAG);32503251switch (getTargetMachine().getCodeModel()) {3252case CodeModel::Small:3253case CodeModel::Medium:3254return TargetLowering::getPICJumpTableRelocBase(Table, DAG);3255default:3256return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),3257getPointerTy(DAG.getDataLayout()));3258}3259}32603261const MCExpr *3262PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,3263unsigned JTI,3264MCContext &Ctx) const {3265if (!Subtarget.isPPC64() || Subtarget.isAIXABI())3266return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);32673268switch (getTargetMachine().getCodeModel()) {3269case CodeModel::Small:3270case CodeModel::Medium:3271return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);3272default:3273return 
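    // Illustrative note on the relative jump-table encoding preferred above
    // (EK_LabelDifference32): each table entry is emitted as a 32-bit
    // difference, roughly
    //   .long .LBB0_5-.LJTI0_0
    // instead of an 8-byte absolute pointer, and the dispatch site adds the
    // entry to the table (or PIC) base before branching. The exact label
    // spelling here is illustrative only.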
MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);3274}3275}32763277SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {3278EVT PtrVT = Op.getValueType();3279JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);32803281// isUsingPCRelativeCalls() returns true when PCRelative is enabled3282if (Subtarget.isUsingPCRelativeCalls()) {3283SDLoc DL(JT);3284EVT Ty = getPointerTy(DAG.getDataLayout());3285SDValue GA =3286DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);3287SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);3288return MatAddr;3289}32903291// 64-bit SVR4 ABI and AIX ABI code are always position-independent.3292// The actual address of the GlobalValue is stored in the TOC.3293if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {3294setUsesTOCBasePtr(DAG);3295SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);3296return getTOCEntry(DAG, SDLoc(JT), GA);3297}32983299unsigned MOHiFlag, MOLoFlag;3300bool IsPIC = isPositionIndependent();3301getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);33023303if (IsPIC && Subtarget.isSVR4ABI()) {3304SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,3305PPCII::MO_PIC_FLAG);3306return getTOCEntry(DAG, SDLoc(GA), GA);3307}33083309SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);3310SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);3311return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);3312}33133314SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,3315SelectionDAG &DAG) const {3316EVT PtrVT = Op.getValueType();3317BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);3318const BlockAddress *BA = BASDN->getBlockAddress();33193320// isUsingPCRelativeCalls() returns true when PCRelative is enabled3321if (Subtarget.isUsingPCRelativeCalls()) {3322SDLoc DL(BASDN);3323EVT Ty = getPointerTy(DAG.getDataLayout());3324SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),3325PPCII::MO_PCREL_FLAG);3326SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);3327return MatAddr;3328}33293330// 64-bit SVR4 ABI and AIX ABI code are always position-independent.3331// The actual BlockAddress is stored in the TOC.3332if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {3333setUsesTOCBasePtr(DAG);3334SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());3335return getTOCEntry(DAG, SDLoc(BASDN), GA);3336}33373338// 32-bit position-independent ELF stores the BlockAddress in the .got.3339if (Subtarget.is32BitELFABI() && isPositionIndependent())3340return getTOCEntry(3341DAG, SDLoc(BASDN),3342DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));33433344unsigned MOHiFlag, MOLoFlag;3345bool IsPIC = isPositionIndependent();3346getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);3347SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);3348SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);3349return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);3350}33513352SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,3353SelectionDAG &DAG) const {3354if (Subtarget.isAIXABI())3355return LowerGlobalTLSAddressAIX(Op, DAG);33563357return LowerGlobalTLSAddressLinux(Op, DAG);3358}33593360/// updateForAIXShLibTLSModelOpt - Helper to initialize TLS model opt settings,3361/// and then apply the update.3362static void updateForAIXShLibTLSModelOpt(TLSModel::Model &Model,3363SelectionDAG &DAG,3364const TargetMachine &TM) {3365// Initialize TLS model opt setting lazily:3366// (1) Use 
initial-exec for single TLS var references within current function.3367// (2) Use local-dynamic for multiple TLS var references within current3368// function.3369PPCFunctionInfo *FuncInfo =3370DAG.getMachineFunction().getInfo<PPCFunctionInfo>();3371if (!FuncInfo->isAIXFuncTLSModelOptInitDone()) {3372SmallPtrSet<const GlobalValue *, 8> TLSGV;3373// Iterate over all instructions within current function, collect all TLS3374// global variables (global variables taken as the first parameter to3375// Intrinsic::threadlocal_address).3376const Function &Func = DAG.getMachineFunction().getFunction();3377for (Function::const_iterator BI = Func.begin(), BE = Func.end(); BI != BE;3378++BI)3379for (BasicBlock::const_iterator II = BI->begin(), IE = BI->end();3380II != IE; ++II)3381if (II->getOpcode() == Instruction::Call)3382if (const CallInst *CI = dyn_cast<const CallInst>(&*II))3383if (Function *CF = CI->getCalledFunction())3384if (CF->isDeclaration() &&3385CF->getIntrinsicID() == Intrinsic::threadlocal_address)3386if (const GlobalValue *GV =3387dyn_cast<GlobalValue>(II->getOperand(0))) {3388TLSModel::Model GVModel = TM.getTLSModel(GV);3389if (GVModel == TLSModel::LocalDynamic)3390TLSGV.insert(GV);3391}33923393unsigned TLSGVCnt = TLSGV.size();3394LLVM_DEBUG(dbgs() << format("LocalDynamic TLSGV count:%d\n", TLSGVCnt));3395if (TLSGVCnt <= PPCAIXTLSModelOptUseIEForLDLimit)3396FuncInfo->setAIXFuncUseTLSIEForLD();3397FuncInfo->setAIXFuncTLSModelOptInitDone();3398}33993400if (FuncInfo->isAIXFuncUseTLSIEForLD()) {3401LLVM_DEBUG(3402dbgs() << DAG.getMachineFunction().getName()3403<< " function is using the TLS-IE model for TLS-LD access.\n");3404Model = TLSModel::InitialExec;3405}3406}34073408SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,3409SelectionDAG &DAG) const {3410GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);34113412if (DAG.getTarget().useEmulatedTLS())3413report_fatal_error("Emulated TLS is not yet supported on AIX");34143415SDLoc dl(GA);3416const GlobalValue *GV = GA->getGlobal();3417EVT PtrVT = getPointerTy(DAG.getDataLayout());3418bool Is64Bit = Subtarget.isPPC64();3419TLSModel::Model Model = getTargetMachine().getTLSModel(GV);34203421// Apply update to the TLS model.3422if (Subtarget.hasAIXShLibTLSModelOpt())3423updateForAIXShLibTLSModelOpt(Model, DAG, getTargetMachine());34243425bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;34263427if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {3428bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();3429bool HasAIXSmallTLSGlobalAttr = false;3430SDValue VariableOffsetTGA =3431DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);3432SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);3433SDValue TLSReg;34343435if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))3436if (GVar->hasAttribute("aix-small-tls"))3437HasAIXSmallTLSGlobalAttr = true;34383439if (Is64Bit) {3440// For local-exec and initial-exec on AIX (64-bit), the sequence generated3441// involves a load of the variable offset (from the TOC), followed by an3442// add of the loaded variable offset to R13 (the thread pointer).3443// This code sequence looks like:3444// ld reg1,var[TC](2)3445// add reg2, reg1, r13 // r13 contains the thread pointer3446TLSReg = DAG.getRegister(PPC::X13, MVT::i64);34473448// With the -maix-small-local-exec-tls option, or with the "aix-small-tls"3449// global variable attribute, produce a faster access sequence for3450// local-exec TLS variables where the offset from the TLS base 
is encoded3451// as an immediate operand.3452//3453// We only utilize the faster local-exec access sequence when the TLS3454// variable has a size within the policy limit. We treat types that are3455// not sized or are empty as being over the policy size limit.3456if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&3457IsTLSLocalExecModel) {3458Type *GVType = GV->getValueType();3459if (GVType->isSized() && !GVType->isEmptyTy() &&3460GV->getDataLayout().getTypeAllocSize(GVType) <=3461AIXSmallTlsPolicySizeLimit)3462return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA, TLSReg);3463}3464} else {3465// For local-exec and initial-exec on AIX (32-bit), the sequence generated3466// involves loading the variable offset from the TOC, generating a call to3467// .__get_tpointer to get the thread pointer (which will be in R3), and3468// adding the two together:3469// lwz reg1,var[TC](2)3470// bla .__get_tpointer3471// add reg2, reg1, r33472TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);34733474// We do not implement the 32-bit version of the faster access sequence3475// for local-exec that is controlled by the -maix-small-local-exec-tls3476// option, or the "aix-small-tls" global variable attribute.3477if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)3478report_fatal_error("The small-local-exec TLS access sequence is "3479"currently only supported on AIX (64-bit mode).");3480}3481return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);3482}34833484if (Model == TLSModel::LocalDynamic) {3485bool HasAIXSmallLocalDynamicTLS = Subtarget.hasAIXSmallLocalDynamicTLS();34863487// We do not implement the 32-bit version of the faster access sequence3488// for local-dynamic that is controlled by -maix-small-local-dynamic-tls.3489if (!Is64Bit && HasAIXSmallLocalDynamicTLS)3490report_fatal_error("The small-local-dynamic TLS access sequence is "3491"currently only supported on AIX (64-bit mode).");34923493// For local-dynamic on AIX, we need to generate one TOC entry for each3494// variable offset, and a single module-handle TOC entry for the entire3495// file.34963497SDValue VariableOffsetTGA =3498DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSLD_FLAG);3499SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);35003501Module *M = DAG.getMachineFunction().getFunction().getParent();3502GlobalVariable *TLSGV =3503dyn_cast_or_null<GlobalVariable>(M->getOrInsertGlobal(3504StringRef("_$TLSML"), PointerType::getUnqual(*DAG.getContext())));3505TLSGV->setThreadLocalMode(GlobalVariable::LocalDynamicTLSModel);3506assert(TLSGV && "Not able to create GV for _$TLSML.");3507SDValue ModuleHandleTGA =3508DAG.getTargetGlobalAddress(TLSGV, dl, PtrVT, 0, PPCII::MO_TLSLDM_FLAG);3509SDValue ModuleHandleTOC = getTOCEntry(DAG, dl, ModuleHandleTGA);3510SDValue ModuleHandle =3511DAG.getNode(PPCISD::TLSLD_AIX, dl, PtrVT, ModuleHandleTOC);35123513// With the -maix-small-local-dynamic-tls option, produce a faster access3514// sequence for local-dynamic TLS variables where the offset from the3515// module-handle is encoded as an immediate operand.3516//3517// We only utilize the faster local-dynamic access sequence when the TLS3518// variable has a size within the policy limit. 
We treat types that are3519// not sized or are empty as being over the policy size limit.3520if (HasAIXSmallLocalDynamicTLS) {3521Type *GVType = GV->getValueType();3522if (GVType->isSized() && !GVType->isEmptyTy() &&3523GV->getDataLayout().getTypeAllocSize(GVType) <=3524AIXSmallTlsPolicySizeLimit)3525return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA,3526ModuleHandle);3527}35283529return DAG.getNode(ISD::ADD, dl, PtrVT, ModuleHandle, VariableOffset);3530}35313532// If Local- or Initial-exec or Local-dynamic is not possible or specified,3533// all GlobalTLSAddress nodes are lowered using the general-dynamic model. We3534// need to generate two TOC entries, one for the variable offset, one for the3535// region handle. The global address for the TOC entry of the region handle is3536// created with the MO_TLSGDM_FLAG flag and the global address for the TOC3537// entry of the variable offset is created with MO_TLSGD_FLAG.3538SDValue VariableOffsetTGA =3539DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGD_FLAG);3540SDValue RegionHandleTGA =3541DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGDM_FLAG);3542SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);3543SDValue RegionHandle = getTOCEntry(DAG, dl, RegionHandleTGA);3544return DAG.getNode(PPCISD::TLSGD_AIX, dl, PtrVT, VariableOffset,3545RegionHandle);3546}35473548SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,3549SelectionDAG &DAG) const {3550// FIXME: TLS addresses currently use medium model code sequences,3551// which is the most useful form. Eventually support for small and3552// large models could be added if users need it, at the cost of3553// additional complexity.3554GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);3555if (DAG.getTarget().useEmulatedTLS())3556return LowerToTLSEmulatedModel(GA, DAG);35573558SDLoc dl(GA);3559const GlobalValue *GV = GA->getGlobal();3560EVT PtrVT = getPointerTy(DAG.getDataLayout());3561bool is64bit = Subtarget.isPPC64();3562const Module *M = DAG.getMachineFunction().getFunction().getParent();3563PICLevel::Level picLevel = M->getPICLevel();35643565const TargetMachine &TM = getTargetMachine();3566TLSModel::Model Model = TM.getTLSModel(GV);35673568if (Model == TLSModel::LocalExec) {3569if (Subtarget.isUsingPCRelativeCalls()) {3570SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64);3571SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,3572PPCII::MO_TPREL_PCREL_FLAG);3573SDValue MatAddr =3574DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA);3575return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr);3576}35773578SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,3579PPCII::MO_TPREL_HA);3580SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,3581PPCII::MO_TPREL_LO);3582SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)3583: DAG.getRegister(PPC::R2, MVT::i32);35843585SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);3586return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);3587}35883589if (Model == TLSModel::InitialExec) {3590bool IsPCRel = Subtarget.isUsingPCRelativeCalls();3591SDValue TGA = DAG.getTargetGlobalAddress(3592GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);3593SDValue TGATLS = DAG.getTargetGlobalAddress(3594GV, dl, PtrVT, 0, IsPCRel ? 
PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);3595SDValue TPOffset;3596if (IsPCRel) {3597SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, TGA);3598TPOffset = DAG.getLoad(MVT::i64, dl, DAG.getEntryNode(), MatPCRel,3599MachinePointerInfo());3600} else {3601SDValue GOTPtr;3602if (is64bit) {3603setUsesTOCBasePtr(DAG);3604SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);3605GOTPtr =3606DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA);3607} else {3608if (!TM.isPositionIndependent())3609GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);3610else if (picLevel == PICLevel::SmallPIC)3611GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);3612else3613GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);3614}3615TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr);3616}3617return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);3618}36193620if (Model == TLSModel::GeneralDynamic) {3621if (Subtarget.isUsingPCRelativeCalls()) {3622SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,3623PPCII::MO_GOT_TLSGD_PCREL_FLAG);3624return DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA);3625}36263627SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);3628SDValue GOTPtr;3629if (is64bit) {3630setUsesTOCBasePtr(DAG);3631SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);3632GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,3633GOTReg, TGA);3634} else {3635if (picLevel == PICLevel::SmallPIC)3636GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);3637else3638GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);3639}3640return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,3641GOTPtr, TGA, TGA);3642}36433644if (Model == TLSModel::LocalDynamic) {3645if (Subtarget.isUsingPCRelativeCalls()) {3646SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,3647PPCII::MO_GOT_TLSLD_PCREL_FLAG);3648SDValue MatPCRel =3649DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA);3650return DAG.getNode(PPCISD::PADDI_DTPREL, dl, PtrVT, MatPCRel, TGA);3651}36523653SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);3654SDValue GOTPtr;3655if (is64bit) {3656setUsesTOCBasePtr(DAG);3657SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);3658GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,3659GOTReg, TGA);3660} else {3661if (picLevel == PICLevel::SmallPIC)3662GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);3663else3664GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);3665}3666SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,3667PtrVT, GOTPtr, TGA, TGA);3668SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,3669PtrVT, TLSAddr, TGA);3670return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);3671}36723673llvm_unreachable("Unknown TLS model!");3674}36753676SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,3677SelectionDAG &DAG) const {3678EVT PtrVT = Op.getValueType();3679GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);3680SDLoc DL(GSDN);3681const GlobalValue *GV = GSDN->getGlobal();36823683// 64-bit SVR4 ABI & AIX ABI code is always position-independent.3684// The actual address of the GlobalValue is stored in the TOC.3685if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {3686if (Subtarget.isUsingPCRelativeCalls()) {3687EVT Ty = getPointerTy(DAG.getDataLayout());3688if (isAccessedAsGotIndirect(Op)) {3689SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),3690PPCII::MO_GOT_PCREL_FLAG);3691SDValue MatPCRel = 
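      // Illustrative only: the PC-relative initial-exec path built here
      // typically ends up as a prefixed GOT load of the thread-pointer offset
      // followed by an add of r13, e.g.
      //   pld r9, x@got@tprel@pcrel(0), 1
      //   add r9, r9, x@tls@pcrel
      // while the non-PC-relative path below goes through ADDIS_GOT_TPREL_HA /
      // LD_GOT_TPREL_L against the TOC pointer instead.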
DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);3692SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,3693MachinePointerInfo());3694return Load;3695} else {3696SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),3697PPCII::MO_PCREL_FLAG);3698return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);3699}3700}3701setUsesTOCBasePtr(DAG);3702SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());3703return getTOCEntry(DAG, DL, GA);3704}37053706unsigned MOHiFlag, MOLoFlag;3707bool IsPIC = isPositionIndependent();3708getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);37093710if (IsPIC && Subtarget.isSVR4ABI()) {3711SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,3712GSDN->getOffset(),3713PPCII::MO_PIC_FLAG);3714return getTOCEntry(DAG, DL, GA);3715}37163717SDValue GAHi =3718DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);3719SDValue GALo =3720DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);37213722return LowerLabelRef(GAHi, GALo, IsPIC, DAG);3723}37243725SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {3726bool IsStrict = Op->isStrictFPOpcode();3727ISD::CondCode CC =3728cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();3729SDValue LHS = Op.getOperand(IsStrict ? 1 : 0);3730SDValue RHS = Op.getOperand(IsStrict ? 2 : 1);3731SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();3732EVT LHSVT = LHS.getValueType();3733SDLoc dl(Op);37343735// Soften the setcc with libcall if it is fp128.3736if (LHSVT == MVT::f128) {3737assert(!Subtarget.hasP9Vector() &&3738"SETCC for f128 is already legal under Power9!");3739softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,3740Op->getOpcode() == ISD::STRICT_FSETCCS);3741if (RHS.getNode())3742LHS = DAG.getNode(ISD::SETCC, dl, Op.getValueType(), LHS, RHS,3743DAG.getCondCode(CC));3744if (IsStrict)3745return DAG.getMergeValues({LHS, Chain}, dl);3746return LHS;3747}37483749assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");37503751if (Op.getValueType() == MVT::v2i64) {3752// When the operands themselves are v2i64 values, we need to do something3753// special because VSX has no underlying comparison operations for these.3754if (LHS.getValueType() == MVT::v2i64) {3755// Equality can be handled by casting to the legal type for Altivec3756// comparisons, everything else needs to be expanded.3757if (CC != ISD::SETEQ && CC != ISD::SETNE)3758return SDValue();3759SDValue SetCC32 = DAG.getSetCC(3760dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS),3761DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC);3762int ShuffV[] = {1, 0, 3, 2};3763SDValue Shuff =3764DAG.getVectorShuffle(MVT::v4i32, dl, SetCC32, SetCC32, ShuffV);3765return DAG.getBitcast(MVT::v2i64,3766DAG.getNode(CC == ISD::SETEQ ? ISD::AND : ISD::OR,3767dl, MVT::v4i32, Shuff, SetCC32));3768}37693770// We handle most of these in the usual way.3771return Op;3772}37733774// If we're comparing for equality to zero, expose the fact that this is3775// implemented as a ctlz/srl pair on ppc, so that the dag combiner can3776// fold the new nodes.3777if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))3778return V;37793780if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {3781// Leave comparisons against 0 and -1 alone for now, since they're usually3782// optimized. 
FIXME: revisit this when we can custom lower all setcc3783// optimizations.3784if (C->isAllOnes() || C->isZero())3785return SDValue();3786}37873788// If we have an integer seteq/setne, turn it into a compare against zero3789// by xor'ing the rhs with the lhs, which is faster than setting a3790// condition register, reading it back out, and masking the correct bit. The3791// normal approach here uses sub to do this instead of xor. Using xor exposes3792// the result to other bit-twiddling opportunities.3793if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {3794EVT VT = Op.getValueType();3795SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, LHS, RHS);3796return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);3797}3798return SDValue();3799}38003801SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {3802SDNode *Node = Op.getNode();3803EVT VT = Node->getValueType(0);3804EVT PtrVT = getPointerTy(DAG.getDataLayout());3805SDValue InChain = Node->getOperand(0);3806SDValue VAListPtr = Node->getOperand(1);3807const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();3808SDLoc dl(Node);38093810assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");38113812// gpr_index3813SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,3814VAListPtr, MachinePointerInfo(SV), MVT::i8);3815InChain = GprIndex.getValue(1);38163817if (VT == MVT::i64) {3818// Check if GprIndex is even3819SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,3820DAG.getConstant(1, dl, MVT::i32));3821SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,3822DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);3823SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,3824DAG.getConstant(1, dl, MVT::i32));3825// Align GprIndex to be even if it isn't3826GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,3827GprIndex);3828}38293830// fpr index is 1 byte after gpr3831SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,3832DAG.getConstant(1, dl, MVT::i32));38333834// fpr3835SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,3836FprPtr, MachinePointerInfo(SV), MVT::i8);3837InChain = FprIndex.getValue(1);38383839SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,3840DAG.getConstant(8, dl, MVT::i32));38413842SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,3843DAG.getConstant(4, dl, MVT::i32));38443845// areas3846SDValue OverflowArea =3847DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());3848InChain = OverflowArea.getValue(1);38493850SDValue RegSaveArea =3851DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());3852InChain = RegSaveArea.getValue(1);38533854// select overflow_area if index > 83855SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,3856DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);38573858// adjustment constant gpr_index * 4/83859SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,3860VT.isInteger() ? GprIndex : FprIndex,3861DAG.getConstant(VT.isInteger() ? 
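  // A minimal standalone sketch (hypothetical helper, not used by this file)
  // of the register-save-area offset computed by the nodes around this point:
  // GPR slots are 4 bytes each starting at reg_save_area, FPR slots are 8
  // bytes each starting 32 bytes in (after the eight 4-byte GPR words).
  //   static unsigned regSaveAreaOffset(bool IsInteger, unsigned Index) {
  //     return IsInteger ? Index * 4 : 32 + Index * 8;
  //   }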
                                                         4 : 8, dl,
                                                         MVT::i32));

  // OurReg = RegSaveArea + RegConstant
  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
                               RegConstant);

  // Floating types are 32 bytes into RegSaveArea
  if (VT.isFloatingPoint())
    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
                         DAG.getConstant(32, dl, MVT::i32));

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
                                   VT.isInteger() ? GprIndex : FprIndex,
                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
                                                   MVT::i32));

  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
                              VT.isInteger() ? VAListPtr : FprPtr,
                              MachinePointerInfo(SV), MVT::i8);

  // determine if we should load from reg_save_area or overflow_area
  SDValue Result =
      DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);

  // increase overflow_area by 4/8 if gpr/fpr > 8
  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
                                                          dl, MVT::i32));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
                             OverflowAreaPlusN);

  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
                              MachinePointerInfo(), MVT::i32);

  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 bytes of alignment padding + 2*sizeof(char*) = 12 bytes
  return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
                       false, true, /*CI=*/nullptr, std::nullopt,
                       MachinePointerInfo(), MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");

  return Op.getOperand(0);
}

SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo &MFI = *MF.getInfo<PPCFunctionInfo>();

  assert((Op.getOpcode() == ISD::INLINEASM ||
          Op.getOpcode() == ISD::INLINEASM_BR) &&
         "Expecting Inline ASM node.");

  // If an LR store is already known to be required then there is no point in
  // checking this ASM as well.
  if (MFI.isLRStoreRequired())
    return Op;

  // Inline ASM nodes have an optional last operand that is an incoming Flag of
  // type MVT::Glue. We want to ignore this last operand if that is the case.
  unsigned NumOps = Op.getNumOperands();
  if (Op.getOperand(NumOps - 1).getValueType() == MVT::Glue)
    --NumOps;

  // Check all operands that may contain the LR.
  for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
    const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
    unsigned NumVals = Flags.getNumOperandRegisters();
    ++i; // Skip the ID value.

    switch (Flags.getKind()) {
    default:
      llvm_unreachable("Bad flags!");
    case InlineAsm::Kind::RegUse:
    case InlineAsm::Kind::Imm:
    case InlineAsm::Kind::Mem:
      i += NumVals;
      break;
    case InlineAsm::Kind::Clobber:
    case InlineAsm::Kind::RegDef:
    case InlineAsm::Kind::RegDefEarlyClobber: {
      for (; NumVals; --NumVals, ++i) {
        Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
        if (Reg != PPC::LR && Reg != PPC::LR8)
          continue;
        MFI.setLRStoreRequired();
        return Op;
      }
      break;
    }
    }
  }

  return Op;
}

SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");

  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp; Args.push_back(Entry);

  // TrampSize == (isPPC64 ? 48 : 40);
  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
                               isPPC64 ?
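  // For reference (illustrative, reconstructed from the argument list being
  // assembled here): the trampoline initialization becomes a libcall of the
  // form
  //   __trampoline_setup(Trmp, /*TrampSize=*/isPPC64 ? 48 : 40, FPtr, Nest);
  // where the runtime writes a small code stub into Trmp that materializes the
  // 'nest' (static chain) pointer and branches to FPtr.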
MVT::i64 : MVT::i32);3994Args.push_back(Entry);39953996Entry.Node = FPtr; Args.push_back(Entry);3997Entry.Node = Nest; Args.push_back(Entry);39983999// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)4000TargetLowering::CallLoweringInfo CLI(DAG);4001CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(4002CallingConv::C, Type::getVoidTy(*DAG.getContext()),4003DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));40044005std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);4006return CallResult.second;4007}40084009SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {4010MachineFunction &MF = DAG.getMachineFunction();4011PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();4012EVT PtrVT = getPointerTy(MF.getDataLayout());40134014SDLoc dl(Op);40154016if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {4017// vastart just stores the address of the VarArgsFrameIndex slot into the4018// memory location argument.4019SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);4020const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();4021return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),4022MachinePointerInfo(SV));4023}40244025// For the 32-bit SVR4 ABI we follow the layout of the va_list struct.4026// We suppose the given va_list is already allocated.4027//4028// typedef struct {4029// char gpr; /* index into the array of 8 GPRs4030// * stored in the register save area4031// * gpr=0 corresponds to r3,4032// * gpr=1 to r4, etc.4033// */4034// char fpr; /* index into the array of 8 FPRs4035// * stored in the register save area4036// * fpr=0 corresponds to f1,4037// * fpr=1 to f2, etc.4038// */4039// char *overflow_arg_area;4040// /* location on stack that holds4041// * the next overflow argument4042// */4043// char *reg_save_area;4044// /* where r3:r10 and f1:f8 (if saved)4045// * are stored4046// */4047// } va_list[1];40484049SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);4050SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);4051SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),4052PtrVT);4053SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),4054PtrVT);40554056uint64_t FrameOffset = PtrVT.getSizeInBits()/8;4057SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);40584059uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;4060SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);40614062uint64_t FPROffset = 1;4063SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);40644065const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();40664067// Store first byte : number of int regs4068SDValue firstStore =4069DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),4070MachinePointerInfo(SV), MVT::i8);4071uint64_t nextOffset = FPROffset;4072SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),4073ConstFPROffset);40744075// Store second byte : number of float regs4076SDValue secondStore =4077DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,4078MachinePointerInfo(SV, nextOffset), MVT::i8);4079nextOffset += StackOffset;4080nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);40814082// Store second word : arguments given on stack4083SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,4084MachinePointerInfo(SV, nextOffset));4085nextOffset += FrameOffset;4086nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, 
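  // Byte-offset sketch of the va_list initialization performed in this
  // function (illustrative only; it mirrors the struct in the comment block
  // above):
  //   offset 0: gpr               (1 byte, next GPR index)
  //   offset 1: fpr               (1 byte, next FPR index)
  //   offset 2: 2 bytes of padding
  //   offset 4: overflow_arg_area (next stack argument)
  //   offset 8: reg_save_area     (start of the saved r3-r10 / f1-f8 image)
  // i.e. FPROffset = 1, StackOffset = 3 and FrameOffset = 4 step the store
  // pointer through those fields.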
nextPtr, ConstFrameOffset);40874088// Store third word : arguments given in registers4089return DAG.getStore(thirdStore, dl, FR, nextPtr,4090MachinePointerInfo(SV, nextOffset));4091}40924093/// FPR - The set of FP registers that should be allocated for arguments4094/// on Darwin and AIX.4095static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5,4096PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10,4097PPC::F11, PPC::F12, PPC::F13};40984099/// CalculateStackSlotSize - Calculates the size reserved for this argument on4100/// the stack.4101static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,4102unsigned PtrByteSize) {4103unsigned ArgSize = ArgVT.getStoreSize();4104if (Flags.isByVal())4105ArgSize = Flags.getByValSize();41064107// Round up to multiples of the pointer size, except for array members,4108// which are always packed.4109if (!Flags.isInConsecutiveRegs())4110ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;41114112return ArgSize;4113}41144115/// CalculateStackSlotAlignment - Calculates the alignment of this argument4116/// on the stack.4117static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,4118ISD::ArgFlagsTy Flags,4119unsigned PtrByteSize) {4120Align Alignment(PtrByteSize);41214122// Altivec parameters are padded to a 16 byte boundary.4123if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||4124ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||4125ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||4126ArgVT == MVT::v1i128 || ArgVT == MVT::f128)4127Alignment = Align(16);41284129// ByVal parameters are aligned as requested.4130if (Flags.isByVal()) {4131auto BVAlign = Flags.getNonZeroByValAlign();4132if (BVAlign > PtrByteSize) {4133if (BVAlign.value() % PtrByteSize != 0)4134llvm_unreachable(4135"ByVal alignment is not a multiple of the pointer size");41364137Alignment = BVAlign;4138}4139}41404141// Array members are always packed to their original alignment.4142if (Flags.isInConsecutiveRegs()) {4143// If the array member was split into multiple registers, the first4144// needs to be aligned to the size of the full type. (Except for4145// ppcf128, which is only aligned as its f64 components.)4146if (Flags.isSplit() && OrigVT != MVT::ppcf128)4147Alignment = Align(OrigVT.getStoreSize());4148else4149Alignment = Align(ArgVT.getStoreSize());4150}41514152return Alignment;4153}41544155/// CalculateStackSlotUsed - Return whether this argument will use its4156/// stack slot (instead of being passed in registers). 
ArgOffset,4157/// AvailableFPRs, and AvailableVRs must hold the current argument4158/// position, and will be updated to account for this argument.4159static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,4160unsigned PtrByteSize, unsigned LinkageSize,4161unsigned ParamAreaSize, unsigned &ArgOffset,4162unsigned &AvailableFPRs,4163unsigned &AvailableVRs) {4164bool UseMemory = false;41654166// Respect alignment of argument on the stack.4167Align Alignment =4168CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);4169ArgOffset = alignTo(ArgOffset, Alignment);4170// If there's no space left in the argument save area, we must4171// use memory (this check also catches zero-sized arguments).4172if (ArgOffset >= LinkageSize + ParamAreaSize)4173UseMemory = true;41744175// Allocate argument on the stack.4176ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);4177if (Flags.isInConsecutiveRegsLast())4178ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;4179// If we overran the argument save area, we must use memory4180// (this check catches arguments passed partially in memory)4181if (ArgOffset > LinkageSize + ParamAreaSize)4182UseMemory = true;41834184// However, if the argument is actually passed in an FPR or a VR,4185// we don't use memory after all.4186if (!Flags.isByVal()) {4187if (ArgVT == MVT::f32 || ArgVT == MVT::f64)4188if (AvailableFPRs > 0) {4189--AvailableFPRs;4190return false;4191}4192if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||4193ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||4194ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||4195ArgVT == MVT::v1i128 || ArgVT == MVT::f128)4196if (AvailableVRs > 0) {4197--AvailableVRs;4198return false;4199}4200}42014202return UseMemory;4203}42044205/// EnsureStackAlignment - Round stack frame size up from NumBytes to4206/// ensure minimum alignment required for target.4207static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,4208unsigned NumBytes) {4209return alignTo(NumBytes, Lowering->getStackAlign());4210}42114212SDValue PPCTargetLowering::LowerFormalArguments(4213SDValue Chain, CallingConv::ID CallConv, bool isVarArg,4214const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,4215SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {4216if (Subtarget.isAIXABI())4217return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,4218InVals);4219if (Subtarget.is64BitELFABI())4220return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,4221InVals);4222assert(Subtarget.is32BitELFABI());4223return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,4224InVals);4225}42264227SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(4228SDValue Chain, CallingConv::ID CallConv, bool isVarArg,4229const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,4230SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {42314232// 32-bit SVR4 ABI Stack Frame Layout:4233// +-----------------------------------+4234// +--> | Back chain |4235// | +-----------------------------------+4236// | | Floating-point register save area |4237// | +-----------------------------------+4238// | | General register save area |4239// | +-----------------------------------+4240// | | CR save word |4241// | +-----------------------------------+4242// | | VRSAVE save word |4243// | +-----------------------------------+4244// | | Alignment padding |4245// | +-----------------------------------+4246// | | Vector register save area |4247// | 
+-----------------------------------+4248// | | Local variable space |4249// | +-----------------------------------+4250// | | Parameter list area |4251// | +-----------------------------------+4252// | | LR save word |4253// | +-----------------------------------+4254// SP--> +--- | Back chain |4255// +-----------------------------------+4256//4257// Specifications:4258// System V Application Binary Interface PowerPC Processor Supplement4259// AltiVec Technology Programming Interface Manual42604261MachineFunction &MF = DAG.getMachineFunction();4262MachineFrameInfo &MFI = MF.getFrameInfo();4263PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();42644265EVT PtrVT = getPointerTy(MF.getDataLayout());4266// Potential tail calls could cause overwriting of argument stack slots.4267bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&4268(CallConv == CallingConv::Fast));4269const Align PtrAlign(4);42704271// Assign locations to all of the incoming arguments.4272SmallVector<CCValAssign, 16> ArgLocs;4273PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,4274*DAG.getContext());42754276// Reserve space for the linkage area on the stack.4277unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();4278CCInfo.AllocateStack(LinkageSize, PtrAlign);4279if (useSoftFloat())4280CCInfo.PreAnalyzeFormalArguments(Ins);42814282CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);4283CCInfo.clearWasPPCF128();42844285for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {4286CCValAssign &VA = ArgLocs[i];42874288// Arguments stored in registers.4289if (VA.isRegLoc()) {4290const TargetRegisterClass *RC;4291EVT ValVT = VA.getValVT();42924293switch (ValVT.getSimpleVT().SimpleTy) {4294default:4295llvm_unreachable("ValVT not supported by formal arguments Lowering");4296case MVT::i1:4297case MVT::i32:4298RC = &PPC::GPRCRegClass;4299break;4300case MVT::f32:4301if (Subtarget.hasP8Vector())4302RC = &PPC::VSSRCRegClass;4303else if (Subtarget.hasSPE())4304RC = &PPC::GPRCRegClass;4305else4306RC = &PPC::F4RCRegClass;4307break;4308case MVT::f64:4309if (Subtarget.hasVSX())4310RC = &PPC::VSFRCRegClass;4311else if (Subtarget.hasSPE())4312// SPE passes doubles in GPR pairs.4313RC = &PPC::GPRCRegClass;4314else4315RC = &PPC::F8RCRegClass;4316break;4317case MVT::v16i8:4318case MVT::v8i16:4319case MVT::v4i32:4320RC = &PPC::VRRCRegClass;4321break;4322case MVT::v4f32:4323RC = &PPC::VRRCRegClass;4324break;4325case MVT::v2f64:4326case MVT::v2i64:4327RC = &PPC::VRRCRegClass;4328break;4329}43304331SDValue ArgValue;4332// Transform the arguments stored in physical registers into4333// virtual ones.4334if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {4335assert(i + 1 < e && "No second half of double precision argument");4336Register RegLo = MF.addLiveIn(VA.getLocReg(), RC);4337Register RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);4338SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);4339SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);4340if (!Subtarget.isLittleEndian())4341std::swap (ArgValueLo, ArgValueHi);4342ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,4343ArgValueHi);4344} else {4345Register Reg = MF.addLiveIn(VA.getLocReg(), RC);4346ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,4347ValVT == MVT::i1 ? 
                                              MVT::i32 : ValVT);
        if (ValVT == MVT::i1)
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
      }

      InVals.push_back(ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      // Get the extended size of the argument type in stack
      unsigned ArgSize = VA.getLocVT().getStoreSize();
      // Get the actual size of the argument type
      unsigned ObjSize = VA.getValVT().getStoreSize();
      unsigned ArgOffset = VA.getLocMemOffset();
      // Stack objects in PPC32 are right justified.
      ArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(
          DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getStackSize();
  MinReservedArea = std::max(MinReservedArea, LinkageSize);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value...
for expansion of llvm.va_start.4401if (isVarArg) {4402static const MCPhysReg GPArgRegs[] = {4403PPC::R3, PPC::R4, PPC::R5, PPC::R6,4404PPC::R7, PPC::R8, PPC::R9, PPC::R10,4405};4406const unsigned NumGPArgRegs = std::size(GPArgRegs);44074408static const MCPhysReg FPArgRegs[] = {4409PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,4410PPC::F84411};4412unsigned NumFPArgRegs = std::size(FPArgRegs);44134414if (useSoftFloat() || hasSPE())4415NumFPArgRegs = 0;44164417FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));4418FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));44194420// Make room for NumGPArgRegs and NumFPArgRegs.4421int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +4422NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;44234424FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(4425PtrVT.getSizeInBits() / 8, CCInfo.getStackSize(), true));44264427FuncInfo->setVarArgsFrameIndex(4428MFI.CreateStackObject(Depth, Align(8), false));4429SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);44304431// The fixed integer arguments of a variadic function are stored to the4432// VarArgsFrameIndex on the stack so that they may be loaded by4433// dereferencing the result of va_next.4434for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {4435// Get an existing live-in vreg, or add a new one.4436Register VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);4437if (!VReg)4438VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);44394440SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);4441SDValue Store =4442DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());4443MemOps.push_back(Store);4444// Increment the address by four for the next argument to store4445SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);4446FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);4447}44484449// FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 64450// is set.4451// The double arguments are stored to the VarArgsFrameIndex4452// on the stack.4453for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {4454// Get an existing live-in vreg, or add a new one.4455Register VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);4456if (!VReg)4457VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);44584459SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);4460SDValue Store =4461DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());4462MemOps.push_back(Store);4463// Increment the address by eight for the next argument to store4464SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,4465PtrVT);4466FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);4467}4468}44694470if (!MemOps.empty())4471Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);44724473return Chain;4474}44754476// PPC64 passes i8, i16, and i32 values in i64 registers. 
Promote4477// value to MVT::i64 and then truncate to the correct register size.4478SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,4479EVT ObjectVT, SelectionDAG &DAG,4480SDValue ArgVal,4481const SDLoc &dl) const {4482if (Flags.isSExt())4483ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,4484DAG.getValueType(ObjectVT));4485else if (Flags.isZExt())4486ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,4487DAG.getValueType(ObjectVT));44884489return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);4490}44914492SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(4493SDValue Chain, CallingConv::ID CallConv, bool isVarArg,4494const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,4495SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {4496// TODO: add description of PPC stack frame format, or at least some docs.4497//4498bool isELFv2ABI = Subtarget.isELFv2ABI();4499bool isLittleEndian = Subtarget.isLittleEndian();4500MachineFunction &MF = DAG.getMachineFunction();4501MachineFrameInfo &MFI = MF.getFrameInfo();4502PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();45034504assert(!(CallConv == CallingConv::Fast && isVarArg) &&4505"fastcc not supported on varargs functions");45064507EVT PtrVT = getPointerTy(MF.getDataLayout());4508// Potential tail calls could cause overwriting of argument stack slots.4509bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&4510(CallConv == CallingConv::Fast));4511unsigned PtrByteSize = 8;4512unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();45134514static const MCPhysReg GPR[] = {4515PPC::X3, PPC::X4, PPC::X5, PPC::X6,4516PPC::X7, PPC::X8, PPC::X9, PPC::X10,4517};4518static const MCPhysReg VR[] = {4519PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,4520PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V134521};45224523const unsigned Num_GPR_Regs = std::size(GPR);4524const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;4525const unsigned Num_VR_Regs = std::size(VR);45264527// Do a first pass over the arguments to determine whether the ABI4528// guarantees that our caller has allocated the parameter save area4529// on its stack frame. In the ELFv1 ABI, this is always the case;4530// in the ELFv2 ABI, it is true if this is a vararg function or if4531// any parameter is located in a stack slot.45324533bool HasParameterArea = !isELFv2ABI || isVarArg;4534unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;4535unsigned NumBytes = LinkageSize;4536unsigned AvailableFPRs = Num_FPR_Regs;4537unsigned AvailableVRs = Num_VR_Regs;4538for (unsigned i = 0, e = Ins.size(); i != e; ++i) {4539if (Ins[i].Flags.isNest())4540continue;45414542if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,4543PtrByteSize, LinkageSize, ParamAreaSize,4544NumBytes, AvailableFPRs, AvailableVRs))4545HasParameterArea = true;4546}45474548// Add DAG nodes to load the arguments or copy them out of registers. 
On4549// entry to a function on PPC, the arguments start after the linkage area,4550// although the first ones are often in registers.45514552unsigned ArgOffset = LinkageSize;4553unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;4554SmallVector<SDValue, 8> MemOps;4555Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();4556unsigned CurArgIdx = 0;4557for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {4558SDValue ArgVal;4559bool needsLoad = false;4560EVT ObjectVT = Ins[ArgNo].VT;4561EVT OrigVT = Ins[ArgNo].ArgVT;4562unsigned ObjSize = ObjectVT.getStoreSize();4563unsigned ArgSize = ObjSize;4564ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;4565if (Ins[ArgNo].isOrigArg()) {4566std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);4567CurArgIdx = Ins[ArgNo].getOrigArgIndex();4568}4569// We re-align the argument offset for each argument, except when using the4570// fast calling convention, when we need to make sure we do that only when4571// we'll actually use a stack slot.4572unsigned CurArgOffset;4573Align Alignment;4574auto ComputeArgOffset = [&]() {4575/* Respect alignment of argument on the stack. */4576Alignment =4577CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);4578ArgOffset = alignTo(ArgOffset, Alignment);4579CurArgOffset = ArgOffset;4580};45814582if (CallConv != CallingConv::Fast) {4583ComputeArgOffset();45844585/* Compute GPR index associated with argument offset. */4586GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;4587GPR_idx = std::min(GPR_idx, Num_GPR_Regs);4588}45894590// FIXME the codegen can be much improved in some cases.4591// We do not have to keep everything in memory.4592if (Flags.isByVal()) {4593assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");45944595if (CallConv == CallingConv::Fast)4596ComputeArgOffset();45974598// ObjSize is the true size, ArgSize rounded up to multiple of registers.4599ObjSize = Flags.getByValSize();4600ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;4601// Empty aggregate parameters do not take up registers. Examples:4602// struct { } a;4603// union { } b;4604// int c[0];4605// etc. However, we have to provide a place-holder in InVals, so4606// pretend we have an 8-byte item at the current address for that4607// purpose.4608if (!ObjSize) {4609int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);4610SDValue FIN = DAG.getFrameIndex(FI, PtrVT);4611InVals.push_back(FIN);4612continue;4613}46144615// Create a stack object covering all stack doublewords occupied4616// by the argument. If the argument is (fully or partially) on4617// the stack, or if the argument is fully in registers but the4618// caller has allocated the parameter save anyway, we can refer4619// directly to the caller's stack frame. 
Otherwise, create a4620// local copy in our own frame.4621int FI;4622if (HasParameterArea ||4623ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)4624FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);4625else4626FI = MFI.CreateStackObject(ArgSize, Alignment, false);4627SDValue FIN = DAG.getFrameIndex(FI, PtrVT);46284629// Handle aggregates smaller than 8 bytes.4630if (ObjSize < PtrByteSize) {4631// The value of the object is its address, which differs from the4632// address of the enclosing doubleword on big-endian systems.4633SDValue Arg = FIN;4634if (!isLittleEndian) {4635SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);4636Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);4637}4638InVals.push_back(Arg);46394640if (GPR_idx != Num_GPR_Regs) {4641Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);4642FuncInfo->addLiveInAttr(VReg, Flags);4643SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);4644EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), ObjSize * 8);4645SDValue Store =4646DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,4647MachinePointerInfo(&*FuncArg), ObjType);4648MemOps.push_back(Store);4649}4650// Whether we copied from a register or not, advance the offset4651// into the parameter save area by a full doubleword.4652ArgOffset += PtrByteSize;4653continue;4654}46554656// The value of the object is its address, which is the address of4657// its first stack doubleword.4658InVals.push_back(FIN);46594660// Store whatever pieces of the object are in registers to memory.4661for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {4662if (GPR_idx == Num_GPR_Regs)4663break;46644665Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);4666FuncInfo->addLiveInAttr(VReg, Flags);4667SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);4668SDValue Addr = FIN;4669if (j) {4670SDValue Off = DAG.getConstant(j, dl, PtrVT);4671Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);4672}4673unsigned StoreSizeInBits = std::min(PtrByteSize, (ObjSize - j)) * 8;4674EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), StoreSizeInBits);4675SDValue Store =4676DAG.getTruncStore(Val.getValue(1), dl, Val, Addr,4677MachinePointerInfo(&*FuncArg, j), ObjType);4678MemOps.push_back(Store);4679++GPR_idx;4680}4681ArgOffset += ArgSize;4682continue;4683}46844685switch (ObjectVT.getSimpleVT().SimpleTy) {4686default: llvm_unreachable("Unhandled argument type!");4687case MVT::i1:4688case MVT::i32:4689case MVT::i64:4690if (Flags.isNest()) {4691// The 'nest' parameter, if any, is passed in R11.4692Register VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);4693ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);46944695if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)4696ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);46974698break;4699}47004701// These can be scalar arguments or elements of an integer array type4702// passed directly. Clang may use those instead of "byval" aggregate4703// types to avoid forcing arguments to memory unnecessarily.4704if (GPR_idx != Num_GPR_Regs) {4705Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);4706FuncInfo->addLiveInAttr(VReg, Flags);4707ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);47084709if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)4710// PPC64 passes i8, i16, and i32 values in i64 registers. 
Promote4711// value to MVT::i64 and then truncate to the correct register size.4712ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);4713} else {4714if (CallConv == CallingConv::Fast)4715ComputeArgOffset();47164717needsLoad = true;4718ArgSize = PtrByteSize;4719}4720if (CallConv != CallingConv::Fast || needsLoad)4721ArgOffset += 8;4722break;47234724case MVT::f32:4725case MVT::f64:4726// These can be scalar arguments or elements of a float array type4727// passed directly. The latter are used to implement ELFv2 homogenous4728// float aggregates.4729if (FPR_idx != Num_FPR_Regs) {4730unsigned VReg;47314732if (ObjectVT == MVT::f32)4733VReg = MF.addLiveIn(FPR[FPR_idx],4734Subtarget.hasP8Vector()4735? &PPC::VSSRCRegClass4736: &PPC::F4RCRegClass);4737else4738VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()4739? &PPC::VSFRCRegClass4740: &PPC::F8RCRegClass);47414742ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);4743++FPR_idx;4744} else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {4745// FIXME: We may want to re-enable this for CallingConv::Fast on the P84746// once we support fp <-> gpr moves.47474748// This can only ever happen in the presence of f32 array types,4749// since otherwise we never run out of FPRs before running out4750// of GPRs.4751Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);4752FuncInfo->addLiveInAttr(VReg, Flags);4753ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);47544755if (ObjectVT == MVT::f32) {4756if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))4757ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,4758DAG.getConstant(32, dl, MVT::i32));4759ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);4760}47614762ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);4763} else {4764if (CallConv == CallingConv::Fast)4765ComputeArgOffset();47664767needsLoad = true;4768}47694770// When passing an array of floats, the array occupies consecutive4771// space in the argument area; only round up to the next doubleword4772// at the end of the array. Otherwise, each float takes 8 bytes.4773if (CallConv != CallingConv::Fast || needsLoad) {4774ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;4775ArgOffset += ArgSize;4776if (Flags.isInConsecutiveRegsLast())4777ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;4778}4779break;4780case MVT::v4f32:4781case MVT::v4i32:4782case MVT::v8i16:4783case MVT::v16i8:4784case MVT::v2f64:4785case MVT::v2i64:4786case MVT::v1i128:4787case MVT::f128:4788// These can be scalar arguments or elements of a vector array type4789// passed directly. 
The latter are used to implement ELFv2 homogenous4790// vector aggregates.4791if (VR_idx != Num_VR_Regs) {4792Register VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);4793ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);4794++VR_idx;4795} else {4796if (CallConv == CallingConv::Fast)4797ComputeArgOffset();4798needsLoad = true;4799}4800if (CallConv != CallingConv::Fast || needsLoad)4801ArgOffset += 16;4802break;4803}48044805// We need to load the argument to a virtual register if we determined4806// above that we ran out of physical registers of the appropriate type.4807if (needsLoad) {4808if (ObjSize < ArgSize && !isLittleEndian)4809CurArgOffset += ArgSize - ObjSize;4810int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);4811SDValue FIN = DAG.getFrameIndex(FI, PtrVT);4812ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());4813}48144815InVals.push_back(ArgVal);4816}48174818// Area that is at least reserved in the caller of this function.4819unsigned MinReservedArea;4820if (HasParameterArea)4821MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);4822else4823MinReservedArea = LinkageSize;48244825// Set the size that is at least reserved in caller of this function. Tail4826// call optimized functions' reserved stack space needs to be aligned so that4827// taking the difference between two stack areas will result in an aligned4828// stack.4829MinReservedArea =4830EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);4831FuncInfo->setMinReservedArea(MinReservedArea);48324833// If the function takes variable number of arguments, make a frame index for4834// the start of the first vararg value... for expansion of llvm.va_start.4835// On ELFv2ABI spec, it writes:4836// C programs that are intended to be *portable* across different compilers4837// and architectures must use the header file <stdarg.h> to deal with variable4838// argument lists.4839if (isVarArg && MFI.hasVAStart()) {4840int Depth = ArgOffset;48414842FuncInfo->setVarArgsFrameIndex(4843MFI.CreateFixedObject(PtrByteSize, Depth, true));4844SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);48454846// If this function is vararg, store any remaining integer argument regs4847// to their spots on the stack so that they may be loaded by dereferencing4848// the result of va_next.4849for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;4850GPR_idx < Num_GPR_Regs; ++GPR_idx) {4851Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);4852SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);4853SDValue Store =4854DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());4855MemOps.push_back(Store);4856// Increment the address by four for the next argument to store4857SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);4858FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);4859}4860}48614862if (!MemOps.empty())4863Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);48644865return Chain;4866}48674868/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be4869/// adjusted to accommodate the arguments for the tailcall.4870static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,4871unsigned ParamSize) {48724873if (!isTailCall) return 0;48744875PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();4876unsigned CallerMinReservedArea = FI->getMinReservedArea();4877int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;4878// Remember only if the new 
adjustment is bigger.4879if (SPDiff < FI->getTailCallSPDelta())4880FI->setTailCallSPDelta(SPDiff);48814882return SPDiff;4883}48844885static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);48864887static bool callsShareTOCBase(const Function *Caller,4888const GlobalValue *CalleeGV,4889const TargetMachine &TM) {4890// It does not make sense to call callsShareTOCBase() with a caller that4891// is PC Relative since PC Relative callers do not have a TOC.4892#ifndef NDEBUG4893const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);4894assert(!STICaller->isUsingPCRelativeCalls() &&4895"PC Relative callers do not have a TOC and cannot share a TOC Base");4896#endif48974898// Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols4899// don't have enough information to determine if the caller and callee share4900// the same TOC base, so we have to pessimistically assume they don't for4901// correctness.4902if (!CalleeGV)4903return false;49044905// If the callee is preemptable, then the static linker will use a plt-stub4906// which saves the toc to the stack, and needs a nop after the call4907// instruction to convert to a toc-restore.4908if (!TM.shouldAssumeDSOLocal(CalleeGV))4909return false;49104911// Functions with PC Relative enabled may clobber the TOC in the same DSO.4912// We may need a TOC restore in the situation where the caller requires a4913// valid TOC but the callee is PC Relative and does not.4914const Function *F = dyn_cast<Function>(CalleeGV);4915const GlobalAlias *Alias = dyn_cast<GlobalAlias>(CalleeGV);49164917// If we have an Alias we can try to get the function from there.4918if (Alias) {4919const GlobalObject *GlobalObj = Alias->getAliaseeObject();4920F = dyn_cast<Function>(GlobalObj);4921}49224923// If we still have no valid function pointer we do not have enough4924// information to determine if the callee uses PC Relative calls so we must4925// assume that it does.4926if (!F)4927return false;49284929// If the callee uses PC Relative we cannot guarantee that the callee won't4930// clobber the TOC of the caller and so we must assume that the two4931// functions do not share a TOC base.4932const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);4933if (STICallee->isUsingPCRelativeCalls())4934return false;49354936// If the GV is not a strong definition then we need to assume it can be4937// replaced by another function at link time. 
The function that replaces4938// it may not share the same TOC as the caller since the callee may be4939// replaced by a PC Relative version of the same function.4940if (!CalleeGV->isStrongDefinitionForLinker())4941return false;49424943// The medium and large code models are expected to provide a sufficiently4944// large TOC to provide all data addressing needs of a module with a4945// single TOC.4946if (CodeModel::Medium == TM.getCodeModel() ||4947CodeModel::Large == TM.getCodeModel())4948return true;49494950// Any explicitly-specified sections and section prefixes must also match.4951// Also, if we're using -ffunction-sections, then each function is always in4952// a different section (the same is true for COMDAT functions).4953if (TM.getFunctionSections() || CalleeGV->hasComdat() ||4954Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())4955return false;4956if (const auto *F = dyn_cast<Function>(CalleeGV)) {4957if (F->getSectionPrefix() != Caller->getSectionPrefix())4958return false;4959}49604961return true;4962}49634964static bool4965needStackSlotPassParameters(const PPCSubtarget &Subtarget,4966const SmallVectorImpl<ISD::OutputArg> &Outs) {4967assert(Subtarget.is64BitELFABI());49684969const unsigned PtrByteSize = 8;4970const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();49714972static const MCPhysReg GPR[] = {4973PPC::X3, PPC::X4, PPC::X5, PPC::X6,4974PPC::X7, PPC::X8, PPC::X9, PPC::X10,4975};4976static const MCPhysReg VR[] = {4977PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,4978PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V134979};49804981const unsigned NumGPRs = std::size(GPR);4982const unsigned NumFPRs = 13;4983const unsigned NumVRs = std::size(VR);4984const unsigned ParamAreaSize = NumGPRs * PtrByteSize;49854986unsigned NumBytes = LinkageSize;4987unsigned AvailableFPRs = NumFPRs;4988unsigned AvailableVRs = NumVRs;49894990for (const ISD::OutputArg& Param : Outs) {4991if (Param.Flags.isNest()) continue;49924993if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize,4994LinkageSize, ParamAreaSize, NumBytes,4995AvailableFPRs, AvailableVRs))4996return true;4997}4998return false;4999}50005001static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {5002if (CB.arg_size() != CallerFn->arg_size())5003return false;50045005auto CalleeArgIter = CB.arg_begin();5006auto CalleeArgEnd = CB.arg_end();5007Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();50085009for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {5010const Value* CalleeArg = *CalleeArgIter;5011const Value* CallerArg = &(*CallerArgIter);5012if (CalleeArg == CallerArg)5013continue;50145015// e.g. 
// @caller([4 x i64] %a, [4 x i64] %b) {
    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
    //      }
    // 1st argument of callee is undef and has the same type as caller.
    if (CalleeArg->getType() == CallerArg->getType() &&
        isa<UndefValue>(CalleeArg))
      continue;

    return false;
  }

  return true;
}

// Returns true if TCO is possible between the caller's and callee's
// calling conventions.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
                                    CallingConv::ID CalleeCC) {
  // Tail calls are possible with fastcc and ccc.
  auto isTailCallableCC = [](CallingConv::ID CC) {
    return CC == CallingConv::C || CC == CallingConv::Fast;
  };
  if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
    return false;

  // We can safely tail call both fastcc and ccc callees from a C calling
  // convention caller. If the caller is fastcc, we may have less stack space
  // than a non-fastcc caller with the same signature so disable tail-calls in
  // that case.
  return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
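// For illustration (this summary is not part of the original source), the
// compatibility rules above allow a tail call for these caller -> callee
// pairs, assuming every other eligibility check below also passes:
//   ccc    -> ccc     yes
//   ccc    -> fastcc  yes
//   fastcc -> fastcc  yes
//   fastcc -> ccc     no  (a fastcc caller may have reserved less stack space)
// Any other calling convention on either side disables TCO here.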
bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
    CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
    bool isCalleeExternalSymbol) const {
  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;

  if (DisableSCO && !TailCallOpt) return false;

  // Variadic argument functions are not supported.
  if (isVarArg) return false;

  // Check that the calling conventions are compatible for TCO.
  if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
    return false;

  // A caller containing any byval parameter is not supported.
  if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
    return false;

  // A callee containing any byval parameter is not supported either.
  // Note: This is a quick workaround, because in some cases, e.g. when the
  // caller's stack size > the callee's stack size, we are still able to apply
  // sibling call optimization. For example, gcc is able to do SCO for caller1
  // in the following example, but not for caller2.
  // struct test {
  //   long int a;
  //   char ary[56];
  // } gTest;
  // __attribute__((noinline)) int callee(struct test v, struct test *b) {
  //   b->a = v.a;
  //   return 0;
  // }
  // void caller1(struct test a, struct test c, struct test *b) {
  //   callee(gTest, b); }
  // void caller2(struct test *b) { callee(gTest, b); }
  if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
    return false;

  // If callee and caller use different calling conventions, we cannot pass
  // parameters on stack since offsets for the parameter area may be different.
  if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
    return false;

  // All variants of 64-bit ELF ABIs without PC-Relative addressing require
  // that the caller and callee share the same TOC for TCO/SCO. If the caller
  // and callee potentially have different TOC bases then we cannot tail call
  // since we need to restore the TOC pointer after the call.
  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
  // We cannot guarantee this for indirect calls or calls to external functions.
  // When PC-Relative addressing is used, the concept of the TOC is no longer
  // applicable so this check is not required.
  // Check first for indirect calls.
  if (!Subtarget.isUsingPCRelativeCalls() &&
      !isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
    return false;

  // Check if we share the TOC base.
  if (!Subtarget.isUsingPCRelativeCalls() &&
      !callsShareTOCBase(CallerFunc, CalleeGV, getTargetMachine()))
    return false;

  // TCO allows altering callee ABI, so we don't have to check further.
  if (CalleeCC == CallingConv::Fast && TailCallOpt)
    return true;

  if (DisableSCO) return false;

  // If the callee uses the same argument list as the caller, then we can
  // apply SCO in this case. If it does not, then we need to check whether the
  // callee needs stack for passing arguments.
  // PC Relative tail calls may not have a CallBase.
  // If there is no CallBase we cannot verify if we have the same argument
  // list so assume that we don't have the same argument list.
  if (CB && !hasSameArgumentList(CallerFunc, *CB) &&
      needStackSlotPassParameters(Subtarget, Outs))
    return false;
  else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
    return false;

  return true;
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool PPCTargetLowering::IsEligibleForTailCallOptimization(
    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
    CallingConv::ID CallerCC, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
    return false;

  // Variable argument functions are not supported.
  if (isVarArg)
    return false;

  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
    // Functions containing by val parameters are not supported.
    if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
      return false;

    // Non-PIC/GOT tail calls are supported.
    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
      return true;

    // At the moment we can only do local tail calls (in same module, hidden
    // or protected) if we are generating PIC.
    if (CalleeGV)
      return CalleeGV->hasHiddenVisibility() ||
             CalleeGV->hasProtectedVisibility();
  }

  return false;
}

/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return nullptr;

  int Addr = C->getZExtValue();
  if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
      SignExtend32<26>(Addr) != Addr)
    return nullptr; // Top 6 bits have to be sext of immediate.

  return DAG
      .getConstant(
          (int)C->getZExtValue() >> 2, SDLoc(Op),
          DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
      .getNode();
}

namespace {

struct TailCallArgumentInfo {
  SDValue Arg;
  SDValue FrameIdxOp;
  int
FrameIdx = 0;51895190TailCallArgumentInfo() = default;5191};51925193} // end anonymous namespace51945195/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.5196static void StoreTailCallArgumentsToStackSlot(5197SelectionDAG &DAG, SDValue Chain,5198const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,5199SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {5200for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {5201SDValue Arg = TailCallArgs[i].Arg;5202SDValue FIN = TailCallArgs[i].FrameIdxOp;5203int FI = TailCallArgs[i].FrameIdx;5204// Store relative to framepointer.5205MemOpChains.push_back(DAG.getStore(5206Chain, dl, Arg, FIN,5207MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));5208}5209}52105211/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to5212/// the appropriate stack slot for the tail call optimized function call.5213static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,5214SDValue OldRetAddr, SDValue OldFP,5215int SPDiff, const SDLoc &dl) {5216if (SPDiff) {5217// Calculate the new stack slot for the return address.5218MachineFunction &MF = DAG.getMachineFunction();5219const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();5220const PPCFrameLowering *FL = Subtarget.getFrameLowering();5221bool isPPC64 = Subtarget.isPPC64();5222int SlotSize = isPPC64 ? 8 : 4;5223int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();5224int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,5225NewRetAddrLoc, true);5226EVT VT = isPPC64 ? MVT::i64 : MVT::i32;5227SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);5228Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,5229MachinePointerInfo::getFixedStack(MF, NewRetAddr));5230}5231return Chain;5232}52335234/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate5235/// the position of the argument.5236static void5237CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,5238SDValue Arg, int SPDiff, unsigned ArgOffset,5239SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {5240int Offset = ArgOffset + SPDiff;5241uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;5242int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);5243EVT VT = isPPC64 ? MVT::i64 : MVT::i32;5244SDValue FIN = DAG.getFrameIndex(FI, VT);5245TailCallArgumentInfo Info;5246Info.Arg = Arg;5247Info.FrameIdxOp = FIN;5248Info.FrameIdx = FI;5249TailCallArguments.push_back(Info);5250}52515252/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address5253/// stack slot. Returns the chain as result and the loaded frame pointers in5254/// LROpOut/FPOpout. Used when tail calling.5255SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(5256SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,5257SDValue &FPOpOut, const SDLoc &dl) const {5258if (SPDiff) {5259// Load the LR and FP stack slot for later adjusting.5260EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;5261LROpOut = getReturnAddrFrameIndex(DAG);5262LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());5263Chain = SDValue(LROpOut.getNode(), 1);5264}5265return Chain;5266}52675268/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified5269/// by "Src" to address "Dst" of size "Size". Alignment information is5270/// specified by the specific parameter attribute. 
The copy will be passed as5271/// a byval function parameter.5272/// Sometimes what we are copying is the end of a larger object, the part that5273/// does not fit in registers.5274static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,5275SDValue Chain, ISD::ArgFlagsTy Flags,5276SelectionDAG &DAG, const SDLoc &dl) {5277SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);5278return DAG.getMemcpy(5279Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), false, false,5280/*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo());5281}52825283/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of5284/// tail calls.5285static void LowerMemOpCallTo(5286SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,5287SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,5288bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,5289SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {5290EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());5291if (!isTailCall) {5292if (isVector) {5293SDValue StackPtr;5294if (isPPC64)5295StackPtr = DAG.getRegister(PPC::X1, MVT::i64);5296else5297StackPtr = DAG.getRegister(PPC::R1, MVT::i32);5298PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,5299DAG.getConstant(ArgOffset, dl, PtrVT));5300}5301MemOpChains.push_back(5302DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));5303// Calculate and remember argument location.5304} else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,5305TailCallArguments);5306}53075308static void5309PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,5310const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,5311SDValue FPOp,5312SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {5313// Emit a sequence of copyto/copyfrom virtual registers for arguments that5314// might overwrite each other in case of tail call optimization.5315SmallVector<SDValue, 8> MemOpChains2;5316// Do not flag preceding copytoreg stuff together with the following stuff.5317InGlue = SDValue();5318StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,5319MemOpChains2, dl);5320if (!MemOpChains2.empty())5321Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);53225323// Store the return address to the appropriate stack slot.5324Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);53255326// Emit callseq_end just before tailcall node.5327Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, dl);5328InGlue = Chain.getValue(1);5329}53305331// Is this global address that of a function that can be called by name? (as5332// opposed to something that must hold a descriptor for an indirect call).5333static bool isFunctionGlobalAddress(const GlobalValue *GV) {5334if (GV) {5335if (GV->isThreadLocal())5336return false;53375338return GV->getValueType()->isFunctionTy();5339}53405341return false;5342}53435344SDValue PPCTargetLowering::LowerCallResult(5345SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,5346const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,5347SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {5348SmallVector<CCValAssign, 16> RVLocs;5349CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,5350*DAG.getContext());53515352CCRetInfo.AnalyzeCallResult(5353Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)5354? 
RetCC_PPC_Cold
                                   : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val;

    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Lo.getValue(1);
      InGlue = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Hi.getValue(1);
      InGlue = Hi.getValue(2);
      if (!Subtarget.isLittleEndian())
        std::swap(Lo, Hi);
      Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
    } else {
      Val = DAG.getCopyFromReg(Chain, dl,
                               VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
                           const PPCSubtarget &Subtarget, bool isPatchPoint) {
  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = G ? G->getGlobal() : nullptr;

  // PatchPoint calls are not indirect.
  if (isPatchPoint)
    return false;

  if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Callee))
    return false;

  // Darwin and 32-bit ELF can use a BLA. The descriptor-based ABIs cannot,
  // because the immediate function pointer points to a descriptor instead of
  // a function entry point. The ELFv2 ABI cannot use a BLA because the function
  // pointer immediate points to the global entry point, while the BLA would
  // need to jump to the local entry point (see rL211174).
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
      isBLACompatibleAddress(Callee, DAG))
    return false;

  return true;
}

// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
  return Subtarget.isAIXABI() ||
         (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
}
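// Illustrative sketch (not from the original source): on the 64-bit TOC-based
// ABIs an indirect call therefore ends up roughly as the sequence
//   std r2, <toc-save-offset>(r1)   ; save the caller's TOC pointer
//   mtctr <callee address>
//   bctrl                           ; branch-and-link through CTR
//   ld  r2, <toc-save-offset>(r1)   ; reload the caller's TOC pointer
// where <toc-save-offset> is the ABI-designated slot in the linkage area
// (e.g. 24 bytes on ELFv2, 40 on ELFv1/AIX 64-bit). The BCTRL_LOAD_TOC pseudo
// selected below models the bctrl and the restoring load as one unit so that
// nothing can be scheduled between them.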
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
                              const Function &Caller, const SDValue &Callee,
                              const PPCSubtarget &Subtarget,
                              const TargetMachine &TM,
                              bool IsStrictFPCall = false) {
  if (CFlags.IsTailCall)
    return PPCISD::TC_RETURN;

  unsigned RetOpc = 0;
  // This is a call through a function pointer.
  if (CFlags.IsIndirect) {
    // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
    // indirect calls. The save of the caller's TOC pointer to the stack will
    // be inserted into the DAG as part of call lowering. The restore of the
    // TOC pointer is modeled by using a pseudo instruction for the call
    // opcode that represents the 2 instruction sequence of an indirect branch
    // and link, immediately followed by a load of the TOC pointer from the
    // stack save slot into gpr2. For the 64-bit ELFv2 ABI with PCRel, do not
    // restore the TOC as it is not saved or used.
    RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
                                                 : PPCISD::BCTRL;
  } else if (Subtarget.isUsingPCRelativeCalls()) {
    assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
    RetOpc = PPCISD::CALL_NOTOC;
  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
    // The ABIs that maintain a TOC pointer across calls need to have a nop
    // immediately following the call instruction if the caller and callee may
    // have different TOC bases. At link time, if the linker determines the
    // calls may not share a TOC base, the call is redirected to a trampoline
    // inserted by the linker. The trampoline will (among other things) save
    // the caller's TOC pointer at an ABI-designated offset in the linkage
    // area and the linker will rewrite the nop to be a load of the TOC
    // pointer from the linkage area into gpr2.
    auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
    RetOpc =
        callsShareTOCBase(&Caller, GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
  } else
    RetOpc = PPCISD::CALL;
  if (IsStrictFPCall) {
    switch (RetOpc) {
    default:
      llvm_unreachable("Unknown call opcode");
    case PPCISD::BCTRL_LOAD_TOC:
      RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
      break;
    case PPCISD::BCTRL:
      RetOpc = PPCISD::BCTRL_RM;
      break;
    case PPCISD::CALL_NOTOC:
      RetOpc = PPCISD::CALL_NOTOC_RM;
      break;
    case PPCISD::CALL:
      RetOpc = PPCISD::CALL_RM;
      break;
    case PPCISD::CALL_NOP:
      RetOpc = PPCISD::CALL_NOP_RM;
      break;
    }
  }
  return RetOpc;
}
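// As a sketch (not from the original source), a direct call that may not
// share a TOC base with its caller is emitted as
//   bl  callee
//   nop
// (PPCISD::CALL_NOP above). If the linker routes the call through a stub that
// switches TOC bases, it rewrites the nop into the restoring load, e.g.
// "ld r2, 24(r1)" for ELFv2; the exact offset is ABI-dependent.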
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                               const SDLoc &dl, const PPCSubtarget &Subtarget) {
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
      return SDValue(Dest, 0);

  // Returns true if the callee is local, and false otherwise.
  auto isLocalCallee = [&]() {
    const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;

    return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
           !isa_and_nonnull<GlobalIFunc>(GV);
  };

  // The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in
  // a static relocation model causes some versions of GNU LD (2.17.50, at
  // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
  // built with secure-PLT.
  bool UsePlt =
      Subtarget.is32BitELFABI() && !isLocalCallee() &&
      Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;

  const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
    const TargetMachine &TM = Subtarget.getTargetMachine();
    const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
    MCSymbolXCOFF *S =
        cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM));

    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
    return DAG.getMCSymbol(S, PtrVT);
  };

  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = G ? G->getGlobal() : nullptr;
  if (isFunctionGlobalAddress(GV)) {
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();

    if (Subtarget.isAIXABI()) {
      assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
      return getAIXFuncEntryPointSymbolSDNode(GV);
    }
    return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
                                      UsePlt ? PPCII::MO_PLT : 0);
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *SymName = S->getSymbol();
    if (Subtarget.isAIXABI()) {
      // If there exists a user-declared function whose name is the same as the
      // ExternalSymbol's, then we pick up the user-declared version.
      const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
      if (const Function *F =
              dyn_cast_or_null<Function>(Mod->getNamedValue(SymName)))
        return getAIXFuncEntryPointSymbolSDNode(F);

      // On AIX, direct function calls reference the symbol for the function's
      // entry point, which is named by prepending a "." before the function's
      // C-linkage name. A Qualname is returned here because an external
      // function entry point is a csect with XTY_ER property.
      const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
        auto &Context = DAG.getMachineFunction().getContext();
        MCSectionXCOFF *Sec = Context.getXCOFFSection(
            (Twine(".") + Twine(SymName)).str(), SectionKind::getMetadata(),
            XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
        return Sec->getQualNameSymbol();
      };

      SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
    }
    return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
                                       UsePlt ? PPCII::MO_PLT : 0);
  }

  // No transformation needed.
  assert(Callee.getNode() && "What no callee?");
  return Callee;
}

static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
  assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
         "Expected a CALLSEQ_STARTSDNode.");

  // The last operand is the chain, except when the node has glue. If the node
  // has glue, then the last operand is the glue, and the chain is the second
  // last operand.
  SDValue LastValue = CallSeqStart.getValue(CallSeqStart->getNumValues() - 1);
  if (LastValue.getValueType() != MVT::Glue)
    return LastValue;

  return CallSeqStart.getValue(CallSeqStart->getNumValues() - 2);
}

// Creates the node that moves a function's address into the count register
// to prepare for an indirect call instruction.
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                SDValue &Glue, SDValue &Chain,
                                const SDLoc &dl) {
  SDValue MTCTROps[] = {Chain, Callee, Glue};
  EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
  Chain = DAG.getNode(PPCISD::MTCTR, dl, ReturnTypes,
                      ArrayRef(MTCTROps, Glue.getNode() ?
3 : 2));5603// The glue is the second value produced.5604Glue = Chain.getValue(1);5605}56065607static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,5608SDValue &Glue, SDValue &Chain,5609SDValue CallSeqStart,5610const CallBase *CB, const SDLoc &dl,5611bool hasNest,5612const PPCSubtarget &Subtarget) {5613// Function pointers in the 64-bit SVR4 ABI do not point to the function5614// entry point, but to the function descriptor (the function entry point5615// address is part of the function descriptor though).5616// The function descriptor is a three doubleword structure with the5617// following fields: function entry point, TOC base address and5618// environment pointer.5619// Thus for a call through a function pointer, the following actions need5620// to be performed:5621// 1. Save the TOC of the caller in the TOC save area of its stack5622// frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).5623// 2. Load the address of the function entry point from the function5624// descriptor.5625// 3. Load the TOC of the callee from the function descriptor into r2.5626// 4. Load the environment pointer from the function descriptor into5627// r11.5628// 5. Branch to the function entry point address.5629// 6. On return of the callee, the TOC of the caller needs to be5630// restored (this is done in FinishCall()).5631//5632// The loads are scheduled at the beginning of the call sequence, and the5633// register copies are flagged together to ensure that no other5634// operations can be scheduled in between. E.g. without flagging the5635// copies together, a TOC access in the caller could be scheduled between5636// the assignment of the callee TOC and the branch to the callee, which leads5637// to incorrect code.56385639// Start by loading the function address from the descriptor.5640SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);5641auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()5642? (MachineMemOperand::MODereferenceable |5643MachineMemOperand::MOInvariant)5644: MachineMemOperand::MONone;56455646MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);56475648// Registers used in building the DAG.5649const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();5650const MCRegister TOCReg = Subtarget.getTOCPointerRegister();56515652// Offsets of descriptor members.5653const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();5654const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();56555656const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;5657const Align Alignment = Subtarget.isPPC64() ? 
Align(8) : Align(4);56585659// One load for the functions entry point address.5660SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,5661Alignment, MMOFlags);56625663// One for loading the TOC anchor for the module that contains the called5664// function.5665SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);5666SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);5667SDValue TOCPtr =5668DAG.getLoad(RegVT, dl, LDChain, AddTOC,5669MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);56705671// One for loading the environment pointer.5672SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);5673SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);5674SDValue LoadEnvPtr =5675DAG.getLoad(RegVT, dl, LDChain, AddPtr,5676MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);567756785679// Then copy the newly loaded TOC anchor to the TOC pointer.5680SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);5681Chain = TOCVal.getValue(0);5682Glue = TOCVal.getValue(1);56835684// If the function call has an explicit 'nest' parameter, it takes the5685// place of the environment pointer.5686assert((!hasNest || !Subtarget.isAIXABI()) &&5687"Nest parameter is not supported on AIX.");5688if (!hasNest) {5689SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);5690Chain = EnvVal.getValue(0);5691Glue = EnvVal.getValue(1);5692}56935694// The rest of the indirect call sequence is the same as the non-descriptor5695// DAG.5696prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);5697}56985699static void5700buildCallOperands(SmallVectorImpl<SDValue> &Ops,5701PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,5702SelectionDAG &DAG,5703SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,5704SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,5705const PPCSubtarget &Subtarget) {5706const bool IsPPC64 = Subtarget.isPPC64();5707// MVT for a general purpose register.5708const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;57095710// First operand is always the chain.5711Ops.push_back(Chain);57125713// If it's a direct call pass the callee as the second operand.5714if (!CFlags.IsIndirect)5715Ops.push_back(Callee);5716else {5717assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");57185719// For the TOC based ABIs, we have saved the TOC pointer to the linkage area5720// on the stack (this would have been done in `LowerCall_64SVR4` or5721// `LowerCall_AIX`). The call instruction is a pseudo instruction that5722// represents both the indirect branch and a load that restores the TOC5723// pointer from the linkage area. The operand for the TOC restore is an add5724// of the TOC save offset to the stack pointer. 
This must be the second5725// operand: after the chain input but before any other variadic arguments.5726// For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not5727// saved or used.5728if (isTOCSaveRestoreRequired(Subtarget)) {5729const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();57305731SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);5732unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();5733SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);5734SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);5735Ops.push_back(AddTOC);5736}57375738// Add the register used for the environment pointer.5739if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)5740Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),5741RegVT));574257435744// Add CTR register as callee so a bctr can be emitted later.5745if (CFlags.IsTailCall)5746Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));5747}57485749// If this is a tail call add stack pointer delta.5750if (CFlags.IsTailCall)5751Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));57525753// Add argument registers to the end of the list so that they are known live5754// into the call.5755for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)5756Ops.push_back(DAG.getRegister(RegsToPass[i].first,5757RegsToPass[i].second.getValueType()));57585759// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is5760// no way to mark dependencies as implicit here.5761// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.5762if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&5763!CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())5764Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));57655766// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls5767if (CFlags.IsVarArg && Subtarget.is32BitELFABI())5768Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));57695770// Add a register mask operand representing the call-preserved registers.5771const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();5772const uint32_t *Mask =5773TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);5774assert(Mask && "Missing call preserved mask for calling convention");5775Ops.push_back(DAG.getRegisterMask(Mask));57765777// If the glue is valid, it is the last operand.5778if (Glue.getNode())5779Ops.push_back(Glue);5780}57815782SDValue PPCTargetLowering::FinishCall(5783CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,5784SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,5785SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,5786unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,5787SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {57885789if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||5790Subtarget.isAIXABI())5791setUsesTOCBasePtr(DAG);57925793unsigned CallOpc =5794getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,5795Subtarget, DAG.getTarget(), CB ? 
CB->isStrictFP() : false);57965797if (!CFlags.IsIndirect)5798Callee = transformCallee(Callee, DAG, dl, Subtarget);5799else if (Subtarget.usesFunctionDescriptors())5800prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,5801dl, CFlags.HasNest, Subtarget);5802else5803prepareIndirectCall(DAG, Callee, Glue, Chain, dl);58045805// Build the operand list for the call instruction.5806SmallVector<SDValue, 8> Ops;5807buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,5808SPDiff, Subtarget);58095810// Emit tail call.5811if (CFlags.IsTailCall) {5812// Indirect tail call when using PC Relative calls do not have the same5813// constraints.5814assert(((Callee.getOpcode() == ISD::Register &&5815cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||5816Callee.getOpcode() == ISD::TargetExternalSymbol ||5817Callee.getOpcode() == ISD::TargetGlobalAddress ||5818isa<ConstantSDNode>(Callee) ||5819(CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&5820"Expecting a global address, external symbol, absolute value, "5821"register or an indirect tail call when PC Relative calls are "5822"used.");5823// PC Relative calls also use TC_RETURN as the way to mark tail calls.5824assert(CallOpc == PPCISD::TC_RETURN &&5825"Unexpected call opcode for a tail call.");5826DAG.getMachineFunction().getFrameInfo().setHasTailCall();5827SDValue Ret = DAG.getNode(CallOpc, dl, MVT::Other, Ops);5828DAG.addNoMergeSiteInfo(Ret.getNode(), CFlags.NoMerge);5829return Ret;5830}58315832std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};5833Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);5834DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);5835Glue = Chain.getValue(1);58365837// When performing tail call optimization the callee pops its arguments off5838// the stack. Account for this here so these bytes can be pushed back on in5839// PPCFrameLowering::eliminateCallFramePseudoInstr.5840int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&5841getTargetMachine().Options.GuaranteedTailCallOpt)5842? 
NumBytes5843: 0;58445845Chain = DAG.getCALLSEQ_END(Chain, NumBytes, BytesCalleePops, Glue, dl);5846Glue = Chain.getValue(1);58475848return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,5849DAG, InVals);5850}58515852bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {5853CallingConv::ID CalleeCC = CB->getCallingConv();5854const Function *CallerFunc = CB->getCaller();5855CallingConv::ID CallerCC = CallerFunc->getCallingConv();5856const Function *CalleeFunc = CB->getCalledFunction();5857if (!CalleeFunc)5858return false;5859const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(CalleeFunc);58605861SmallVector<ISD::OutputArg, 2> Outs;5862SmallVector<ISD::InputArg, 2> Ins;58635864GetReturnInfo(CalleeCC, CalleeFunc->getReturnType(),5865CalleeFunc->getAttributes(), Outs, *this,5866CalleeFunc->getDataLayout());58675868return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,5869CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,5870false /*isCalleeExternalSymbol*/);5871}58725873bool PPCTargetLowering::isEligibleForTCO(5874const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,5875CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,5876const SmallVectorImpl<ISD::OutputArg> &Outs,5877const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,5878bool isCalleeExternalSymbol) const {5879if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))5880return false;58815882if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())5883return IsEligibleForTailCallOptimization_64SVR4(5884CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,5885isCalleeExternalSymbol);5886else5887return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,5888isVarArg, Ins);5889}58905891SDValue5892PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,5893SmallVectorImpl<SDValue> &InVals) const {5894SelectionDAG &DAG = CLI.DAG;5895SDLoc &dl = CLI.DL;5896SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;5897SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;5898SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;5899SDValue Chain = CLI.Chain;5900SDValue Callee = CLI.Callee;5901bool &isTailCall = CLI.IsTailCall;5902CallingConv::ID CallConv = CLI.CallConv;5903bool isVarArg = CLI.IsVarArg;5904bool isPatchPoint = CLI.IsPatchPoint;5905const CallBase *CB = CLI.CB;59065907if (isTailCall) {5908MachineFunction &MF = DAG.getMachineFunction();5909CallingConv::ID CallerCC = MF.getFunction().getCallingConv();5910auto *G = dyn_cast<GlobalAddressSDNode>(Callee);5911const GlobalValue *GV = G ? G->getGlobal() : nullptr;5912bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Callee);59135914isTailCall =5915isEligibleForTCO(GV, CallConv, CallerCC, CB, isVarArg, Outs, Ins,5916&(MF.getFunction()), IsCalleeExternalSymbol);5917if (isTailCall) {5918++NumTailCalls;5919if (!getTargetMachine().Options.GuaranteedTailCallOpt)5920++NumSiblingCalls;59215922// PC Relative calls no longer guarantee that the callee is a Global5923// Address Node. The callee could be an indirect tail call in which5924// case the SDValue for the callee could be a load (to load the address5925// of a function pointer) or it may be a register copy (to move the5926// address of the callee from a function parameter into a virtual5927// register). 
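      // For instance (illustrative), a source-level call such as
      // 'return callback(x);', where 'callback' is a function-pointer
      // parameter, reaches this point with a register-copy callee rather than
      // a GlobalAddressSDNode.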
It may also be an ExternalSymbolSDNode (e.g. memcpy).
      assert((Subtarget.isUsingPCRelativeCalls() ||
              isa<GlobalAddressSDNode>(Callee)) &&
             "Callee should be an llvm::Function object.");

      LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
                        << "\nTCO callee: ");
      LLVM_DEBUG(Callee.dump());
    }
  }

  if (!isTailCall && CB && CB->isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");

  // When long calls (i.e. indirect calls) are always used, calls are always
  // made via function pointer. If we have a function name, first translate it
  // into a pointer.
  if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
      !isTailCall)
    Callee = LowerGlobalAddress(Callee, DAG);

  CallFlags CFlags(
      CallConv, isTailCall, isVarArg, isPatchPoint,
      isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
      // hasNest
      Subtarget.is64BitELFABI() &&
          any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
      CLI.NoMerge);

  if (Subtarget.isAIXABI())
    return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                         InVals, CB);

  assert(Subtarget.isSVR4ABI());
  if (Subtarget.isPPC64())
    return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                            InVals, CB);
  return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                          InVals, CB);
}

SDValue PPCTargetLowering::LowerCall_32SVR4(
    SDValue Chain, SDValue Callee, CallFlags CFlags,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    const CallBase *CB) const {
  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
  // of the 32-bit SVR4 ABI stack frame layout.

  const CallingConv::ID CallConv = CFlags.CallConv;
  const bool IsVarArg = CFlags.IsVarArg;
  const bool IsTailCall = CFlags.IsTailCall;

  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) && "Unknown calling convention!");

  const Align PtrAlign(4);

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamic
  // alloca and for restoring the caller's stack pointer in this function's
  // epilogue.
This is5994// done because by tail calling the called function might overwrite the value5995// in this function's (MF) stack pointer stack slot 0(SP).5996if (getTargetMachine().Options.GuaranteedTailCallOpt &&5997CallConv == CallingConv::Fast)5998MF.getInfo<PPCFunctionInfo>()->setHasFastCall();59996000// Count how many bytes are to be pushed on the stack, including the linkage6001// area, parameter list area and the part of the local variable space which6002// contains copies of aggregates which are passed by value.60036004// Assign locations to all of the outgoing arguments.6005SmallVector<CCValAssign, 16> ArgLocs;6006PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());60076008// Reserve space for the linkage area on the stack.6009CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),6010PtrAlign);6011if (useSoftFloat())6012CCInfo.PreAnalyzeCallOperands(Outs);60136014if (IsVarArg) {6015// Handle fixed and variable vector arguments differently.6016// Fixed vector arguments go into registers as long as registers are6017// available. Variable vector arguments always go into memory.6018unsigned NumArgs = Outs.size();60196020for (unsigned i = 0; i != NumArgs; ++i) {6021MVT ArgVT = Outs[i].VT;6022ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;6023bool Result;60246025if (Outs[i].IsFixed) {6026Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,6027CCInfo);6028} else {6029Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,6030ArgFlags, CCInfo);6031}60326033if (Result) {6034#ifndef NDEBUG6035errs() << "Call operand #" << i << " has unhandled type "6036<< ArgVT << "\n";6037#endif6038llvm_unreachable(nullptr);6039}6040}6041} else {6042// All arguments are treated the same.6043CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);6044}6045CCInfo.clearWasPPCF128();60466047// Assign locations to all of the outgoing aggregate by value arguments.6048SmallVector<CCValAssign, 16> ByValArgLocs;6049CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());60506051// Reserve stack space for the allocations in CCInfo.6052CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);60536054CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);60556056// Size of the linkage area, parameter list area and the part of the local6057// space variable where copies of aggregates which are passed by value are6058// stored.6059unsigned NumBytes = CCByValInfo.getStackSize();60606061// Calculate by how many bytes the stack has to be adjusted in case of tail6062// call optimization.6063int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);60646065// Adjust the stack pointer for the new arguments...6066// These operations are automatically eliminated by the prolog/epilog pass6067Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);6068SDValue CallSeqStart = Chain;60696070// Load the return address and frame pointer so it can be moved somewhere else6071// later.6072SDValue LROp, FPOp;6073Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);60746075// Set up a copy of the stack pointer for use loading and storing any6076// arguments that may not fit in the registers available for argument6077// passing.6078SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);60796080SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;6081SmallVector<TailCallArgumentInfo, 8> TailCallArguments;6082SmallVector<SDValue, 8> MemOpChains;60836084bool seenFloatArg = false;6085// Walk the register/memloc assignments, inserting copies/loads.6086// 
i - Tracks the index into the list of registers allocated for the call6087// RealArgIdx - Tracks the index into the list of actual function arguments6088// j - Tracks the index into the list of byval arguments6089for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();6090i != e;6091++i, ++RealArgIdx) {6092CCValAssign &VA = ArgLocs[i];6093SDValue Arg = OutVals[RealArgIdx];6094ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;60956096if (Flags.isByVal()) {6097// Argument is an aggregate which is passed by value, thus we need to6098// create a copy of it in the local variable space of the current stack6099// frame (which is the stack frame of the caller) and pass the address of6100// this copy to the callee.6101assert((j < ByValArgLocs.size()) && "Index out of bounds!");6102CCValAssign &ByValVA = ByValArgLocs[j++];6103assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");61046105// Memory reserved in the local variable space of the callers stack frame.6106unsigned LocMemOffset = ByValVA.getLocMemOffset();61076108SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);6109PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),6110StackPtr, PtrOff);61116112// Create a copy of the argument in the local area of the current6113// stack frame.6114SDValue MemcpyCall =6115CreateCopyOfByValArgument(Arg, PtrOff,6116CallSeqStart.getNode()->getOperand(0),6117Flags, DAG, dl);61186119// This must go outside the CALLSEQ_START..END.6120SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,6121SDLoc(MemcpyCall));6122DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),6123NewCallSeqStart.getNode());6124Chain = CallSeqStart = NewCallSeqStart;61256126// Pass the address of the aggregate copy on the stack either in a6127// physical register or in the parameter list area of the current stack6128// frame to the callee.6129Arg = PtrOff;6130}61316132// When useCRBits() is true, there can be i1 arguments.6133// It is because getRegisterType(MVT::i1) => MVT::i1,6134// and for other integer types getRegisterType() => MVT::i32.6135// Extend i1 and ensure callee will get i32.6136if (Arg.getValueType() == MVT::i1)6137Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,6138dl, MVT::i32, Arg);61396140if (VA.isRegLoc()) {6141seenFloatArg |= VA.getLocVT().isFloatingPoint();6142// Put argument in a physical register.6143if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {6144bool IsLE = Subtarget.isLittleEndian();6145SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,6146DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));6147RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));6148SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,6149DAG.getIntPtrConstant(IsLE ? 
1 : 0, dl));6150RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),6151SVal.getValue(0)));6152} else6153RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));6154} else {6155// Put argument in the parameter list area of the current stack frame.6156assert(VA.isMemLoc());6157unsigned LocMemOffset = VA.getLocMemOffset();61586159if (!IsTailCall) {6160SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);6161PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),6162StackPtr, PtrOff);61636164MemOpChains.push_back(6165DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));6166} else {6167// Calculate and remember argument location.6168CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,6169TailCallArguments);6170}6171}6172}61736174if (!MemOpChains.empty())6175Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);61766177// Build a sequence of copy-to-reg nodes chained together with token chain6178// and flag operands which copy the outgoing args into the appropriate regs.6179SDValue InGlue;6180for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {6181Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,6182RegsToPass[i].second, InGlue);6183InGlue = Chain.getValue(1);6184}61856186// Set CR bit 6 to true if this is a vararg call with floating args passed in6187// registers.6188if (IsVarArg) {6189SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);6190SDValue Ops[] = { Chain, InGlue };61916192Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl,6193VTs, ArrayRef(Ops, InGlue.getNode() ? 2 : 1));61946195InGlue = Chain.getValue(1);6196}61976198if (IsTailCall)6199PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,6200TailCallArguments);62016202return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,6203Callee, SPDiff, NumBytes, Ins, InVals, CB);6204}62056206// Copy an argument into memory, being careful to do this outside the6207// call sequence for the call to which the argument belongs.6208SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(6209SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,6210SelectionDAG &DAG, const SDLoc &dl) const {6211SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,6212CallSeqStart.getNode()->getOperand(0),6213Flags, DAG, dl);6214// The MEMCPY must go outside the CALLSEQ_START..END.6215int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);6216SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,6217SDLoc(MemcpyCall));6218DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),6219NewCallSeqStart.getNode());6220return NewCallSeqStart;6221}62226223SDValue PPCTargetLowering::LowerCall_64SVR4(6224SDValue Chain, SDValue Callee, CallFlags CFlags,6225const SmallVectorImpl<ISD::OutputArg> &Outs,6226const SmallVectorImpl<SDValue> &OutVals,6227const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,6228SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,6229const CallBase *CB) const {6230bool isELFv2ABI = Subtarget.isELFv2ABI();6231bool isLittleEndian = Subtarget.isLittleEndian();6232unsigned NumOps = Outs.size();6233bool IsSibCall = false;6234bool IsFastCall = CFlags.CallConv == CallingConv::Fast;62356236EVT PtrVT = getPointerTy(DAG.getDataLayout());6237unsigned PtrByteSize = 8;62386239MachineFunction &MF = DAG.getMachineFunction();62406241if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)6242IsSibCall = true;62436244// Mark this function as potentially containing a function that 
contains a6245// tail call. As a consequence the frame pointer will be used for dynamicalloc6246// and restoring the callers stack pointer in this functions epilog. This is6247// done because by tail calling the called function might overwrite the value6248// in this function's (MF) stack pointer stack slot 0(SP).6249if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)6250MF.getInfo<PPCFunctionInfo>()->setHasFastCall();62516252assert(!(IsFastCall && CFlags.IsVarArg) &&6253"fastcc not supported on varargs functions");62546255// Count how many bytes are to be pushed on the stack, including the linkage6256// area, and parameter passing area. On ELFv1, the linkage area is 48 bytes6257// reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage6258// area is 32 bytes reserved space for [SP][CR][LR][TOC].6259unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();6260unsigned NumBytes = LinkageSize;6261unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;62626263static const MCPhysReg GPR[] = {6264PPC::X3, PPC::X4, PPC::X5, PPC::X6,6265PPC::X7, PPC::X8, PPC::X9, PPC::X10,6266};6267static const MCPhysReg VR[] = {6268PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,6269PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V136270};62716272const unsigned NumGPRs = std::size(GPR);6273const unsigned NumFPRs = useSoftFloat() ? 0 : 13;6274const unsigned NumVRs = std::size(VR);62756276// On ELFv2, we can avoid allocating the parameter area if all the arguments6277// can be passed to the callee in registers.6278// For the fast calling convention, there is another check below.6279// Note: We should keep consistent with LowerFormalArguments_64SVR4()6280bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;6281if (!HasParameterArea) {6282unsigned ParamAreaSize = NumGPRs * PtrByteSize;6283unsigned AvailableFPRs = NumFPRs;6284unsigned AvailableVRs = NumVRs;6285unsigned NumBytesTmp = NumBytes;6286for (unsigned i = 0; i != NumOps; ++i) {6287if (Outs[i].Flags.isNest()) continue;6288if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,6289PtrByteSize, LinkageSize, ParamAreaSize,6290NumBytesTmp, AvailableFPRs, AvailableVRs))6291HasParameterArea = true;6292}6293}62946295// When using the fast calling convention, we don't provide backing for6296// arguments that will be in registers.6297unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;62986299// Avoid allocating parameter area for fastcc functions if all the arguments6300// can be passed in the registers.6301if (IsFastCall)6302HasParameterArea = false;63036304// Add up all the space actually used.6305for (unsigned i = 0; i != NumOps; ++i) {6306ISD::ArgFlagsTy Flags = Outs[i].Flags;6307EVT ArgVT = Outs[i].VT;6308EVT OrigVT = Outs[i].ArgVT;63096310if (Flags.isNest())6311continue;63126313if (IsFastCall) {6314if (Flags.isByVal()) {6315NumGPRsUsed += (Flags.getByValSize()+7)/8;6316if (NumGPRsUsed > NumGPRs)6317HasParameterArea = true;6318} else {6319switch (ArgVT.getSimpleVT().SimpleTy) {6320default: llvm_unreachable("Unexpected ValueType for argument!");6321case MVT::i1:6322case MVT::i32:6323case MVT::i64:6324if (++NumGPRsUsed <= NumGPRs)6325continue;6326break;6327case MVT::v4i32:6328case MVT::v8i16:6329case MVT::v16i8:6330case MVT::v2f64:6331case MVT::v2i64:6332case MVT::v1i128:6333case MVT::f128:6334if (++NumVRsUsed <= NumVRs)6335continue;6336break;6337case MVT::v4f32:6338if (++NumVRsUsed <= NumVRs)6339continue;6340break;6341case MVT::f32:6342case MVT::f64:6343if (++NumFPRsUsed <= 
NumFPRs)6344continue;6345break;6346}6347HasParameterArea = true;6348}6349}63506351/* Respect alignment of argument on the stack. */6352auto Alignement =6353CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);6354NumBytes = alignTo(NumBytes, Alignement);63556356NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);6357if (Flags.isInConsecutiveRegsLast())6358NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;6359}63606361unsigned NumBytesActuallyUsed = NumBytes;63626363// In the old ELFv1 ABI,6364// the prolog code of the callee may store up to 8 GPR argument registers to6365// the stack, allowing va_start to index over them in memory if its varargs.6366// Because we cannot tell if this is needed on the caller side, we have to6367// conservatively assume that it is needed. As such, make sure we have at6368// least enough stack space for the caller to store the 8 GPRs.6369// In the ELFv2 ABI, we allocate the parameter area iff a callee6370// really requires memory operands, e.g. a vararg function.6371if (HasParameterArea)6372NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);6373else6374NumBytes = LinkageSize;63756376// Tail call needs the stack to be aligned.6377if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)6378NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);63796380int SPDiff = 0;63816382// Calculate by how many bytes the stack has to be adjusted in case of tail6383// call optimization.6384if (!IsSibCall)6385SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);63866387// To protect arguments on the stack from being clobbered in a tail call,6388// force all the loads to happen before doing any other lowering.6389if (CFlags.IsTailCall)6390Chain = DAG.getStackArgumentTokenFactor(Chain);63916392// Adjust the stack pointer for the new arguments...6393// These operations are automatically eliminated by the prolog/epilog pass6394if (!IsSibCall)6395Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);6396SDValue CallSeqStart = Chain;63976398// Load the return address and frame pointer so it can be move somewhere else6399// later.6400SDValue LROp, FPOp;6401Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);64026403// Set up a copy of the stack pointer for use loading and storing any6404// arguments that may not fit in the registers available for argument6405// passing.6406SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);64076408// Figure out which arguments are going to go in registers, and which in6409// memory. 
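  // As the register tables above show, there are eight candidate GPRs
  // (X3-X10), thirteen FPRs and twelve VRs (V2-V13) for passing arguments; an
  // argument is only stored to the parameter save area once the relevant
  // register pool is exhausted.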
Also, if this is a vararg function, floating point operations6410// must be stored to our stack, and loaded into integer regs as well, if6411// any integer regs are available for argument passing.6412unsigned ArgOffset = LinkageSize;64136414SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;6415SmallVector<TailCallArgumentInfo, 8> TailCallArguments;64166417SmallVector<SDValue, 8> MemOpChains;6418for (unsigned i = 0; i != NumOps; ++i) {6419SDValue Arg = OutVals[i];6420ISD::ArgFlagsTy Flags = Outs[i].Flags;6421EVT ArgVT = Outs[i].VT;6422EVT OrigVT = Outs[i].ArgVT;64236424// PtrOff will be used to store the current argument to the stack if a6425// register cannot be found for it.6426SDValue PtrOff;64276428// We re-align the argument offset for each argument, except when using the6429// fast calling convention, when we need to make sure we do that only when6430// we'll actually use a stack slot.6431auto ComputePtrOff = [&]() {6432/* Respect alignment of argument on the stack. */6433auto Alignment =6434CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);6435ArgOffset = alignTo(ArgOffset, Alignment);64366437PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());64386439PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);6440};64416442if (!IsFastCall) {6443ComputePtrOff();64446445/* Compute GPR index associated with argument offset. */6446GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;6447GPR_idx = std::min(GPR_idx, NumGPRs);6448}64496450// Promote integers to 64-bit values.6451if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {6452// FIXME: Should this use ANY_EXTEND if neither sext nor zext?6453unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;6454Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);6455}64566457// FIXME memcpy is used way more than necessary. Correctness first.6458// Note: "by value" is code for passing a structure by value, not6459// basic types.6460if (Flags.isByVal()) {6461// Note: Size includes alignment padding, so6462// struct x { short a; char b; }6463// will have Size = 4. With #pragma pack(1), it will have Size = 3.6464// These are the proper values we need for right-justifying the6465// aggregate in a parameter register.6466unsigned Size = Flags.getByValSize();64676468// An empty aggregate parameter takes up no storage and no6469// registers.6470if (Size == 0)6471continue;64726473if (IsFastCall)6474ComputePtrOff();64756476// All aggregates smaller than 8 bytes must be passed right-justified.6477if (Size==1 || Size==2 || Size==4) {6478EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);6479if (GPR_idx != NumGPRs) {6480SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,6481MachinePointerInfo(), VT);6482MemOpChains.push_back(Load.getValue(1));6483RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));64846485ArgOffset += PtrByteSize;6486continue;6487}6488}64896490if (GPR_idx == NumGPRs && Size < 8) {6491SDValue AddPtr = PtrOff;6492if (!isLittleEndian) {6493SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,6494PtrOff.getValueType());6495AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);6496}6497Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,6498CallSeqStart,6499Flags, DAG, dl);6500ArgOffset += PtrByteSize;6501continue;6502}6503// Copy the object to parameter save area if it can not be entirely passed6504// by registers.6505// FIXME: we only need to copy the parts which need to be passed in6506// parameter save area. 
For the parts passed by registers, we don't need6507// to copy them to the stack although we need to allocate space for them6508// in parameter save area.6509if ((NumGPRs - GPR_idx) * PtrByteSize < Size)6510Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,6511CallSeqStart,6512Flags, DAG, dl);65136514// When a register is available, pass a small aggregate right-justified.6515if (Size < 8 && GPR_idx != NumGPRs) {6516// The easiest way to get this right-justified in a register6517// is to copy the structure into the rightmost portion of a6518// local variable slot, then load the whole slot into the6519// register.6520// FIXME: The memcpy seems to produce pretty awful code for6521// small aggregates, particularly for packed ones.6522// FIXME: It would be preferable to use the slot in the6523// parameter save area instead of a new local variable.6524SDValue AddPtr = PtrOff;6525if (!isLittleEndian) {6526SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());6527AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);6528}6529Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,6530CallSeqStart,6531Flags, DAG, dl);65326533// Load the slot into the register.6534SDValue Load =6535DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());6536MemOpChains.push_back(Load.getValue(1));6537RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));65386539// Done with this argument.6540ArgOffset += PtrByteSize;6541continue;6542}65436544// For aggregates larger than PtrByteSize, copy the pieces of the6545// object that fit into registers from the parameter save area.6546for (unsigned j=0; j<Size; j+=PtrByteSize) {6547SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());6548SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);6549if (GPR_idx != NumGPRs) {6550unsigned LoadSizeInBits = std::min(PtrByteSize, (Size - j)) * 8;6551EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), LoadSizeInBits);6552SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, AddArg,6553MachinePointerInfo(), ObjType);65546555MemOpChains.push_back(Load.getValue(1));6556RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));6557ArgOffset += PtrByteSize;6558} else {6559ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;6560break;6561}6562}6563continue;6564}65656566switch (Arg.getSimpleValueType().SimpleTy) {6567default: llvm_unreachable("Unexpected ValueType for argument!");6568case MVT::i1:6569case MVT::i32:6570case MVT::i64:6571if (Flags.isNest()) {6572// The 'nest' parameter, if any, is passed in R11.6573RegsToPass.push_back(std::make_pair(PPC::X11, Arg));6574break;6575}65766577// These can be scalar arguments or elements of an integer array type6578// passed directly. Clang may use those instead of "byval" aggregate6579// types to avoid forcing arguments to memory unnecessarily.6580if (GPR_idx != NumGPRs) {6581RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));6582} else {6583if (IsFastCall)6584ComputePtrOff();65856586assert(HasParameterArea &&6587"Parameter area must exist to pass an argument in memory.");6588LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,6589true, CFlags.IsTailCall, false, MemOpChains,6590TailCallArguments, dl);6591if (IsFastCall)6592ArgOffset += PtrByteSize;6593}6594if (!IsFastCall)6595ArgOffset += PtrByteSize;6596break;6597case MVT::f32:6598case MVT::f64: {6599// These can be scalar arguments or elements of a float array type6600// passed directly. 
The latter are used to implement ELFv2 homogenous6601// float aggregates.66026603// Named arguments go into FPRs first, and once they overflow, the6604// remaining arguments go into GPRs and then the parameter save area.6605// Unnamed arguments for vararg functions always go to GPRs and6606// then the parameter save area. For now, put all arguments to vararg6607// routines always in both locations (FPR *and* GPR or stack slot).6608bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;6609bool NeededLoad = false;66106611// First load the argument into the next available FPR.6612if (FPR_idx != NumFPRs)6613RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));66146615// Next, load the argument into GPR or stack slot if needed.6616if (!NeedGPROrStack)6617;6618else if (GPR_idx != NumGPRs && !IsFastCall) {6619// FIXME: We may want to re-enable this for CallingConv::Fast on the P86620// once we support fp <-> gpr moves.66216622// In the non-vararg case, this can only ever happen in the6623// presence of f32 array types, since otherwise we never run6624// out of FPRs before running out of GPRs.6625SDValue ArgVal;66266627// Double values are always passed in a single GPR.6628if (Arg.getValueType() != MVT::f32) {6629ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);66306631// Non-array float values are extended and passed in a GPR.6632} else if (!Flags.isInConsecutiveRegs()) {6633ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);6634ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);66356636// If we have an array of floats, we collect every odd element6637// together with its predecessor into one GPR.6638} else if (ArgOffset % PtrByteSize != 0) {6639SDValue Lo, Hi;6640Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);6641Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);6642if (!isLittleEndian)6643std::swap(Lo, Hi);6644ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);66456646// The final element, if even, goes into the first half of a GPR.6647} else if (Flags.isInConsecutiveRegsLast()) {6648ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);6649ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);6650if (!isLittleEndian)6651ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,6652DAG.getConstant(32, dl, MVT::i32));66536654// Non-final even elements are skipped; they will be handled6655// together the with subsequent argument on the next go-around.6656} else6657ArgVal = SDValue();66586659if (ArgVal.getNode())6660RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));6661} else {6662if (IsFastCall)6663ComputePtrOff();66646665// Single-precision floating-point values are mapped to the6666// second (rightmost) word of the stack doubleword.6667if (Arg.getValueType() == MVT::f32 &&6668!isLittleEndian && !Flags.isInConsecutiveRegs()) {6669SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());6670PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);6671}66726673assert(HasParameterArea &&6674"Parameter area must exist to pass an argument in memory.");6675LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,6676true, CFlags.IsTailCall, false, MemOpChains,6677TailCallArguments, dl);66786679NeededLoad = true;6680}6681// When passing an array of floats, the array occupies consecutive6682// space in the argument area; only round up to the next doubleword6683// at the end of the array. 
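      // For example, three consecutive f32 elements of such an array land at
      // ArgOffset+0, +4 and +8, and ArgOffset is only rounded up to the next
      // doubleword after the last element has been processed.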
Otherwise, each float takes 8 bytes.6684if (!IsFastCall || NeededLoad) {6685ArgOffset += (Arg.getValueType() == MVT::f32 &&6686Flags.isInConsecutiveRegs()) ? 4 : 8;6687if (Flags.isInConsecutiveRegsLast())6688ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;6689}6690break;6691}6692case MVT::v4f32:6693case MVT::v4i32:6694case MVT::v8i16:6695case MVT::v16i8:6696case MVT::v2f64:6697case MVT::v2i64:6698case MVT::v1i128:6699case MVT::f128:6700// These can be scalar arguments or elements of a vector array type6701// passed directly. The latter are used to implement ELFv2 homogenous6702// vector aggregates.67036704// For a varargs call, named arguments go into VRs or on the stack as6705// usual; unnamed arguments always go to the stack or the corresponding6706// GPRs when within range. For now, we always put the value in both6707// locations (or even all three).6708if (CFlags.IsVarArg) {6709assert(HasParameterArea &&6710"Parameter area must exist if we have a varargs call.");6711// We could elide this store in the case where the object fits6712// entirely in R registers. Maybe later.6713SDValue Store =6714DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());6715MemOpChains.push_back(Store);6716if (VR_idx != NumVRs) {6717SDValue Load =6718DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());6719MemOpChains.push_back(Load.getValue(1));6720RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));6721}6722ArgOffset += 16;6723for (unsigned i=0; i<16; i+=PtrByteSize) {6724if (GPR_idx == NumGPRs)6725break;6726SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,6727DAG.getConstant(i, dl, PtrVT));6728SDValue Load =6729DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());6730MemOpChains.push_back(Load.getValue(1));6731RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));6732}6733break;6734}67356736// Non-varargs Altivec params go into VRs or on the stack.6737if (VR_idx != NumVRs) {6738RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));6739} else {6740if (IsFastCall)6741ComputePtrOff();67426743assert(HasParameterArea &&6744"Parameter area must exist to pass an argument in memory.");6745LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,6746true, CFlags.IsTailCall, true, MemOpChains,6747TailCallArguments, dl);6748if (IsFastCall)6749ArgOffset += 16;6750}67516752if (!IsFastCall)6753ArgOffset += 16;6754break;6755}6756}67576758assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&6759"mismatch in size of parameter area");6760(void)NumBytesActuallyUsed;67616762if (!MemOpChains.empty())6763Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);67646765// Check if this is an indirect call (MTCTR/BCTRL).6766// See prepareDescriptorIndirectCall and buildCallOperands for more6767// information about calls through function pointers in the 64-bit SVR4 ABI.6768if (CFlags.IsIndirect) {6769// For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the6770// caller in the TOC save area.6771if (isTOCSaveRestoreRequired(Subtarget)) {6772assert(!CFlags.IsTailCall && "Indirect tails calls not supported");6773// Load r2 into a virtual register and store it to the TOC save area.6774setUsesTOCBasePtr(DAG);6775SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);6776// TOC save area offset.6777unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();6778SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);6779SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);6780Chain = 
DAG.getStore(Val.getValue(1), dl, Val, AddPtr,6781MachinePointerInfo::getStack(6782DAG.getMachineFunction(), TOCSaveOffset));6783}6784// In the ELFv2 ABI, R12 must contain the address of an indirect callee.6785// This does not mean the MTCTR instruction must use R12; it's easier6786// to model this as an extra parameter, so do that.6787if (isELFv2ABI && !CFlags.IsPatchPoint)6788RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));6789}67906791// Build a sequence of copy-to-reg nodes chained together with token chain6792// and flag operands which copy the outgoing args into the appropriate regs.6793SDValue InGlue;6794for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {6795Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,6796RegsToPass[i].second, InGlue);6797InGlue = Chain.getValue(1);6798}67996800if (CFlags.IsTailCall && !IsSibCall)6801PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,6802TailCallArguments);68036804return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,6805Callee, SPDiff, NumBytes, Ins, InVals, CB);6806}68076808// Returns true when the shadow of a general purpose argument register6809// in the parameter save area is aligned to at least 'RequiredAlign'.6810static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {6811assert(RequiredAlign.value() <= 16 &&6812"Required alignment greater than stack alignment.");6813switch (Reg) {6814default:6815report_fatal_error("called on invalid register.");6816case PPC::R5:6817case PPC::R9:6818case PPC::X3:6819case PPC::X5:6820case PPC::X7:6821case PPC::X9:6822// These registers are 16 byte aligned which is the most strict aligment6823// we can support.6824return true;6825case PPC::R3:6826case PPC::R7:6827case PPC::X4:6828case PPC::X6:6829case PPC::X8:6830case PPC::X10:6831// The shadow of these registers in the PSA is 8 byte aligned.6832return RequiredAlign <= 8;6833case PPC::R4:6834case PPC::R6:6835case PPC::R8:6836case PPC::R10:6837return RequiredAlign <= 4;6838}6839}68406841static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,6842CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,6843CCState &S) {6844AIXCCState &State = static_cast<AIXCCState &>(S);6845const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(6846State.getMachineFunction().getSubtarget());6847const bool IsPPC64 = Subtarget.isPPC64();6848const unsigned PtrSize = IsPPC64 ? 8 : 4;6849const Align PtrAlign(PtrSize);6850const Align StackAlign(16);6851const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;68526853if (ValVT == MVT::f128)6854report_fatal_error("f128 is unimplemented on AIX.");68556856if (ArgFlags.isNest())6857report_fatal_error("Nest arguments are unimplemented.");68586859static const MCPhysReg GPR_32[] = {// 32-bit registers.6860PPC::R3, PPC::R4, PPC::R5, PPC::R6,6861PPC::R7, PPC::R8, PPC::R9, PPC::R10};6862static const MCPhysReg GPR_64[] = {// 64-bit registers.6863PPC::X3, PPC::X4, PPC::X5, PPC::X6,6864PPC::X7, PPC::X8, PPC::X9, PPC::X10};68656866static const MCPhysReg VR[] = {// Vector registers.6867PPC::V2, PPC::V3, PPC::V4, PPC::V5,6868PPC::V6, PPC::V7, PPC::V8, PPC::V9,6869PPC::V10, PPC::V11, PPC::V12, PPC::V13};68706871const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? 
GPR_64 : GPR_32;

  if (ArgFlags.isByVal()) {
    const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
    if (ByValAlign > StackAlign)
      report_fatal_error("Pass-by-value arguments with alignment greater than "
                         "16 are not supported.");

    const unsigned ByValSize = ArgFlags.getByValSize();
    const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;

    // An empty aggregate parameter takes up no storage and no registers,
    // but needs a MemLoc for a stack slot for the formal arguments side.
    if (ByValSize == 0) {
      State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
                                       State.getStackSize(), RegVT, LocInfo));
      return false;
    }

    // Shadow allocate any registers that are not properly aligned.
    unsigned NextReg = State.getFirstUnallocated(GPRs);
    while (NextReg != GPRs.size() &&
           !isGPRShadowAligned(GPRs[NextReg], ObjAlign)) {
      // Shadow allocate the next register since its alignment is not strict
      // enough.
      unsigned Reg = State.AllocateReg(GPRs);
      // Allocate the stack space shadowed by said register.
      State.AllocateStack(PtrSize, PtrAlign);
      assert(Reg && "Allocating register unexpectedly failed.");
      (void)Reg;
      NextReg = State.getFirstUnallocated(GPRs);
    }

    const unsigned StackSize = alignTo(ByValSize, ObjAlign);
    unsigned Offset = State.AllocateStack(StackSize, ObjAlign);
    for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
      if (unsigned Reg = State.AllocateReg(GPRs))
        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
      else {
        State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
                                         Offset, MVT::INVALID_SIMPLE_VALUE_TYPE,
                                         LocInfo));
        break;
      }
    }
    return false;
  }

  // Arguments always reserve parameter save area.
  switch (ValVT.SimpleTy) {
  default:
    report_fatal_error("Unhandled value type for argument.");
  case MVT::i64:
    // i64 arguments should have been split to i32 for PPC32.
    assert(IsPPC64 && "PPC32 should have split i64 values.");
    [[fallthrough]];
  case MVT::i1:
  case MVT::i32: {
    const unsigned Offset = State.AllocateStack(PtrSize, PtrAlign);
    // AIX integer arguments are always passed in register width.
    if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
      LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
                                  : CCValAssign::LocInfo::ZExt;
    if (unsigned Reg = State.AllocateReg(GPRs))
      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
    else
      State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));

    return false;
  }
  case MVT::f32:
  case MVT::f64: {
    // Parameter save area (PSA) is reserved even if the float passes in fpr.
    const unsigned StoreSize = LocVT.getStoreSize();
    // Floats are always 4-byte aligned in the PSA on AIX.
    // This includes f64 in 64-bit mode for ABI compatibility.
    const unsigned Offset =
        State.AllocateStack(IsPPC64 ?
8 : StoreSize, Align(4));6948unsigned FReg = State.AllocateReg(FPR);6949if (FReg)6950State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));69516952// Reserve and initialize GPRs or initialize the PSA as required.6953for (unsigned I = 0; I < StoreSize; I += PtrSize) {6954if (unsigned Reg = State.AllocateReg(GPRs)) {6955assert(FReg && "An FPR should be available when a GPR is reserved.");6956if (State.isVarArg()) {6957// Successfully reserved GPRs are only initialized for vararg calls.6958// Custom handling is required for:6959// f64 in PPC32 needs to be split into 2 GPRs.6960// f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.6961State.addLoc(6962CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));6963}6964} else {6965// If there are insufficient GPRs, the PSA needs to be initialized.6966// Initialization occurs even if an FPR was initialized for6967// compatibility with the AIX XL compiler. The full memory for the6968// argument will be initialized even if a prior word is saved in GPR.6969// A custom memLoc is used when the argument also passes in FPR so6970// that the callee handling can skip over it easily.6971State.addLoc(6972FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,6973LocInfo)6974: CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));6975break;6976}6977}69786979return false;6980}6981case MVT::v4f32:6982case MVT::v4i32:6983case MVT::v8i16:6984case MVT::v16i8:6985case MVT::v2i64:6986case MVT::v2f64:6987case MVT::v1i128: {6988const unsigned VecSize = 16;6989const Align VecAlign(VecSize);69906991if (!State.isVarArg()) {6992// If there are vector registers remaining we don't consume any stack6993// space.6994if (unsigned VReg = State.AllocateReg(VR)) {6995State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));6996return false;6997}6998// Vectors passed on the stack do not shadow GPRs or FPRs even though they6999// might be allocated in the portion of the PSA that is shadowed by the7000// GPRs.7001const unsigned Offset = State.AllocateStack(VecSize, VecAlign);7002State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));7003return false;7004}70057006unsigned NextRegIndex = State.getFirstUnallocated(GPRs);7007// Burn any underaligned registers and their shadowed stack space until7008// we reach the required alignment.7009while (NextRegIndex != GPRs.size() &&7010!isGPRShadowAligned(GPRs[NextRegIndex], VecAlign)) {7011// Shadow allocate register and its stack shadow.7012unsigned Reg = State.AllocateReg(GPRs);7013State.AllocateStack(PtrSize, PtrAlign);7014assert(Reg && "Allocating register unexpectedly failed.");7015(void)Reg;7016NextRegIndex = State.getFirstUnallocated(GPRs);7017}70187019// Vectors that are passed as fixed arguments are handled differently.7020// They are passed in VRs if any are available (unlike arguments passed7021// through ellipses) and shadow GPRs (unlike arguments to non-vaarg7022// functions)7023if (State.isFixed(ValNo)) {7024if (unsigned VReg = State.AllocateReg(VR)) {7025State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));7026// Shadow allocate GPRs and stack space even though we pass in a VR.7027for (unsigned I = 0; I != VecSize; I += PtrSize)7028State.AllocateReg(GPRs);7029State.AllocateStack(VecSize, VecAlign);7030return false;7031}7032// No vector registers remain so pass on the stack.7033const unsigned Offset = State.AllocateStack(VecSize, VecAlign);7034State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));7035return 
false;7036}70377038// If all GPRS are consumed then we pass the argument fully on the stack.7039if (NextRegIndex == GPRs.size()) {7040const unsigned Offset = State.AllocateStack(VecSize, VecAlign);7041State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));7042return false;7043}70447045// Corner case for 32-bit codegen. We have 2 registers to pass the first7046// half of the argument, and then need to pass the remaining half on the7047// stack.7048if (GPRs[NextRegIndex] == PPC::R9) {7049const unsigned Offset = State.AllocateStack(VecSize, VecAlign);7050State.addLoc(7051CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));70527053const unsigned FirstReg = State.AllocateReg(PPC::R9);7054const unsigned SecondReg = State.AllocateReg(PPC::R10);7055assert(FirstReg && SecondReg &&7056"Allocating R9 or R10 unexpectedly failed.");7057State.addLoc(7058CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, RegVT, LocInfo));7059State.addLoc(7060CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, RegVT, LocInfo));7061return false;7062}70637064// We have enough GPRs to fully pass the vector argument, and we have7065// already consumed any underaligned registers. Start with the custom7066// MemLoc and then the custom RegLocs.7067const unsigned Offset = State.AllocateStack(VecSize, VecAlign);7068State.addLoc(7069CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));7070for (unsigned I = 0; I != VecSize; I += PtrSize) {7071const unsigned Reg = State.AllocateReg(GPRs);7072assert(Reg && "Failed to allocated register for vararg vector argument");7073State.addLoc(7074CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));7075}7076return false;7077}7078}7079return true;7080}70817082// So far, this function is only used by LowerFormalArguments_AIX()7083static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,7084bool IsPPC64,7085bool HasP8Vector,7086bool HasVSX) {7087assert((IsPPC64 || SVT != MVT::i64) &&7088"i64 should have been split for 32-bit codegen.");70897090switch (SVT) {7091default:7092report_fatal_error("Unexpected value type for formal argument");7093case MVT::i1:7094case MVT::i32:7095case MVT::i64:7096return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;7097case MVT::f32:7098return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;7099case MVT::f64:7100return HasVSX ? 
&PPC::VSFRCRegClass : &PPC::F8RCRegClass;7101case MVT::v4f32:7102case MVT::v4i32:7103case MVT::v8i16:7104case MVT::v16i8:7105case MVT::v2i64:7106case MVT::v2f64:7107case MVT::v1i128:7108return &PPC::VRRCRegClass;7109}7110}71117112static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,7113SelectionDAG &DAG, SDValue ArgValue,7114MVT LocVT, const SDLoc &dl) {7115assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());7116assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());71177118if (Flags.isSExt())7119ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,7120DAG.getValueType(ValVT));7121else if (Flags.isZExt())7122ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,7123DAG.getValueType(ValVT));71247125return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);7126}71277128static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {7129const unsigned LASize = FL->getLinkageSize();71307131if (PPC::GPRCRegClass.contains(Reg)) {7132assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&7133"Reg must be a valid argument register!");7134return LASize + 4 * (Reg - PPC::R3);7135}71367137if (PPC::G8RCRegClass.contains(Reg)) {7138assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&7139"Reg must be a valid argument register!");7140return LASize + 8 * (Reg - PPC::X3);7141}71427143llvm_unreachable("Only general purpose registers expected.");7144}71457146// AIX ABI Stack Frame Layout:7147//7148// Low Memory +--------------------------------------------+7149// SP +---> | Back chain | ---+7150// | +--------------------------------------------+ |7151// | | Saved Condition Register | |7152// | +--------------------------------------------+ |7153// | | Saved Linkage Register | |7154// | +--------------------------------------------+ | Linkage Area7155// | | Reserved for compilers | |7156// | +--------------------------------------------+ |7157// | | Reserved for binders | |7158// | +--------------------------------------------+ |7159// | | Saved TOC pointer | ---+7160// | +--------------------------------------------+7161// | | Parameter save area |7162// | +--------------------------------------------+7163// | | Alloca space |7164// | +--------------------------------------------+7165// | | Local variable space |7166// | +--------------------------------------------+7167// | | Float/int conversion temporary |7168// | +--------------------------------------------+7169// | | Save area for AltiVec registers |7170// | +--------------------------------------------+7171// | | AltiVec alignment padding |7172// | +--------------------------------------------+7173// | | Save area for VRSAVE register |7174// | +--------------------------------------------+7175// | | Save area for General Purpose registers |7176// | +--------------------------------------------+7177// | | Save area for Floating Point registers |7178// | +--------------------------------------------+7179// +---- | Back chain |7180// High Memory +--------------------------------------------+7181//7182// Specifications:7183// AIX 7.2 Assembler Language Reference7184// Subroutine linkage convention71857186SDValue PPCTargetLowering::LowerFormalArguments_AIX(7187SDValue Chain, CallingConv::ID CallConv, bool isVarArg,7188const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,7189SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {71907191assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||7192CallConv == CallingConv::Fast) &&7193"Unexpected calling convention!");71947195if 
(getTargetMachine().Options.GuaranteedTailCallOpt)7196report_fatal_error("Tail call support is unimplemented on AIX.");71977198if (useSoftFloat())7199report_fatal_error("Soft float support is unimplemented on AIX.");72007201const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();72027203const bool IsPPC64 = Subtarget.isPPC64();7204const unsigned PtrByteSize = IsPPC64 ? 8 : 4;72057206// Assign locations to all of the incoming arguments.7207SmallVector<CCValAssign, 16> ArgLocs;7208MachineFunction &MF = DAG.getMachineFunction();7209MachineFrameInfo &MFI = MF.getFrameInfo();7210PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();7211AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());72127213const EVT PtrVT = getPointerTy(MF.getDataLayout());7214// Reserve space for the linkage area on the stack.7215const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();7216CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));7217CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);72187219SmallVector<SDValue, 8> MemOps;72207221for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {7222CCValAssign &VA = ArgLocs[I++];7223MVT LocVT = VA.getLocVT();7224MVT ValVT = VA.getValVT();7225ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;7226// For compatibility with the AIX XL compiler, the float args in the7227// parameter save area are initialized even if the argument is available7228// in register. The caller is required to initialize both the register7229// and memory, however, the callee can choose to expect it in either.7230// The memloc is dismissed here because the argument is retrieved from7231// the register.7232if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())7233continue;72347235auto HandleMemLoc = [&]() {7236const unsigned LocSize = LocVT.getStoreSize();7237const unsigned ValSize = ValVT.getStoreSize();7238assert((ValSize <= LocSize) &&7239"Object size is larger than size of MemLoc");7240int CurArgOffset = VA.getLocMemOffset();7241// Objects are right-justified because AIX is big-endian.7242if (LocSize > ValSize)7243CurArgOffset += LocSize - ValSize;7244// Potential tail calls could cause overwriting of argument stack slots.7245const bool IsImmutable =7246!(getTargetMachine().Options.GuaranteedTailCallOpt &&7247(CallConv == CallingConv::Fast));7248int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);7249SDValue FIN = DAG.getFrameIndex(FI, PtrVT);7250SDValue ArgValue =7251DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());7252InVals.push_back(ArgValue);7253};72547255// Vector arguments to VaArg functions are passed both on the stack, and7256// in any available GPRs. 
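    // For example, a 16-byte vector vararg occupies one 16-byte stack slot
    // plus two GPRs on 64-bit AIX, or up to four GPRs on 32-bit AIX (see the
    // custom MemLoc/RegLoc handling in CC_AIX above).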
Load the value from the stack and add the GPRs7257// as live ins.7258if (VA.isMemLoc() && VA.needsCustom()) {7259assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");7260assert(isVarArg && "Only use custom memloc for vararg.");7261// ValNo of the custom MemLoc, so we can compare it to the ValNo of the7262// matching custom RegLocs.7263const unsigned OriginalValNo = VA.getValNo();7264(void)OriginalValNo;72657266auto HandleCustomVecRegLoc = [&]() {7267assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&7268"Missing custom RegLoc.");7269VA = ArgLocs[I++];7270assert(VA.getValVT().isVector() &&7271"Unexpected Val type for custom RegLoc.");7272assert(VA.getValNo() == OriginalValNo &&7273"ValNo mismatch between custom MemLoc and RegLoc.");7274MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;7275MF.addLiveIn(VA.getLocReg(),7276getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),7277Subtarget.hasVSX()));7278};72797280HandleMemLoc();7281// In 64-bit there will be exactly 2 custom RegLocs that follow, and in7282// in 32-bit there will be 2 custom RegLocs if we are passing in R9 and7283// R10.7284HandleCustomVecRegLoc();7285HandleCustomVecRegLoc();72867287// If we are targeting 32-bit, there might be 2 extra custom RegLocs if7288// we passed the vector in R5, R6, R7 and R8.7289if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {7290assert(!IsPPC64 &&7291"Only 2 custom RegLocs expected for 64-bit codegen.");7292HandleCustomVecRegLoc();7293HandleCustomVecRegLoc();7294}72957296continue;7297}72987299if (VA.isRegLoc()) {7300if (VA.getValVT().isScalarInteger())7301FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);7302else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {7303switch (VA.getValVT().SimpleTy) {7304default:7305report_fatal_error("Unhandled value type for argument.");7306case MVT::f32:7307FuncInfo->appendParameterType(PPCFunctionInfo::ShortFloatingPoint);7308break;7309case MVT::f64:7310FuncInfo->appendParameterType(PPCFunctionInfo::LongFloatingPoint);7311break;7312}7313} else if (VA.getValVT().isVector()) {7314switch (VA.getValVT().SimpleTy) {7315default:7316report_fatal_error("Unhandled value type for argument.");7317case MVT::v16i8:7318FuncInfo->appendParameterType(PPCFunctionInfo::VectorChar);7319break;7320case MVT::v8i16:7321FuncInfo->appendParameterType(PPCFunctionInfo::VectorShort);7322break;7323case MVT::v4i32:7324case MVT::v2i64:7325case MVT::v1i128:7326FuncInfo->appendParameterType(PPCFunctionInfo::VectorInt);7327break;7328case MVT::v4f32:7329case MVT::v2f64:7330FuncInfo->appendParameterType(PPCFunctionInfo::VectorFloat);7331break;7332}7333}7334}73357336if (Flags.isByVal() && VA.isMemLoc()) {7337const unsigned Size =7338alignTo(Flags.getByValSize() ? 
Flags.getByValSize() : PtrByteSize,
                  PtrByteSize);
      const int FI = MF.getFrameInfo().CreateFixedObject(
          Size, VA.getLocMemOffset(), /* IsImmutable */ false,
          /* IsAliased */ true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);

      continue;
    }

    if (Flags.isByVal()) {
      assert(VA.isRegLoc() && "MemLocs should already be handled.");

      const MCPhysReg ArgReg = VA.getLocReg();
      const PPCFrameLowering *FL = Subtarget.getFrameLowering();

      const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
      const int FI = MF.getFrameInfo().CreateFixedObject(
          StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
          /* IsAliased */ true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);

      // Add live ins for all the RegLocs for the same ByVal.
      const TargetRegisterClass *RegClass =
          IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;

      auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
                                               unsigned Offset) {
        const Register VReg = MF.addLiveIn(PhysReg, RegClass);
        // Since the caller's side has left-justified the aggregate in the
        // register, we can simply store the entire register into the stack
        // slot.
        SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
        // The store to the fixedstack object is needed because accessing a
        // field of the ByVal will use a gep and load. Ideally we will optimize
        // to extracting the value from the register directly, and elide the
        // stores when the argument's address is not taken, but that will need
        // to be future work.
        SDValue Store = DAG.getStore(
            CopyFrom.getValue(1), dl, CopyFrom,
            DAG.getObjectPtrOffset(dl, FIN, TypeSize::getFixed(Offset)),
            MachinePointerInfo::getFixedStack(MF, FI, Offset));

        MemOps.push_back(Store);
      };

      unsigned Offset = 0;
      HandleRegLoc(VA.getLocReg(), Offset);
      Offset += PtrByteSize;
      for (; Offset != StackSize && ArgLocs[I].isRegLoc();
           Offset += PtrByteSize) {
        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
               "RegLocs should be for ByVal argument.");

        const CCValAssign RL = ArgLocs[I++];
        HandleRegLoc(RL.getLocReg(), Offset);
        FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
      }

      if (Offset != StackSize) {
        assert(ArgLocs[I].getValNo() == VA.getValNo() &&
               "Expected MemLoc for remaining bytes.");
        assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
        // Consume the MemLoc. The InVal has already been emitted, so nothing
        // more needs to be done.
        ++I;
      }

      continue;
    }

    if (VA.isRegLoc() && !VA.needsCustom()) {
      MVT::SimpleValueType SVT = ValVT.SimpleTy;
      Register VReg =
          MF.addLiveIn(VA.getLocReg(),
                       getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
                                         Subtarget.hasVSX()));
      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
      if (ValVT.isScalarInteger() &&
          (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
        ArgValue =
            truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
      }
      InVals.push_back(ArgValue);
      continue;
    }
    if (VA.isMemLoc()) {
      HandleMemLoc();
      continue;
    }
  }

  // On AIX a minimum of 8 words is saved to the parameter save area.
  const unsigned MinParameterSaveArea = 8 * PtrByteSize;
  // Area that is at least reserved in the caller of this function.
  unsigned CallerReservedArea = std::max<unsigned>(
      CCInfo.getStackSize(), LinkageSize +
MinParameterSaveArea);74377438// Set the size that is at least reserved in caller of this function. Tail7439// call optimized function's reserved stack space needs to be aligned so7440// that taking the difference between two stack areas will result in an7441// aligned stack.7442CallerReservedArea =7443EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);7444FuncInfo->setMinReservedArea(CallerReservedArea);74457446if (isVarArg) {7447FuncInfo->setVarArgsFrameIndex(7448MFI.CreateFixedObject(PtrByteSize, CCInfo.getStackSize(), true));7449SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);74507451static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,7452PPC::R7, PPC::R8, PPC::R9, PPC::R10};74537454static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,7455PPC::X7, PPC::X8, PPC::X9, PPC::X10};7456const unsigned NumGPArgRegs = std::size(IsPPC64 ? GPR_64 : GPR_32);74577458// The fixed integer arguments of a variadic function are stored to the7459// VarArgsFrameIndex on the stack so that they may be loaded by7460// dereferencing the result of va_next.7461for (unsigned GPRIndex =7462(CCInfo.getStackSize() - LinkageSize) / PtrByteSize;7463GPRIndex < NumGPArgRegs; ++GPRIndex) {74647465const Register VReg =7466IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)7467: MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);74687469SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);7470SDValue Store =7471DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());7472MemOps.push_back(Store);7473// Increment the address for the next argument to store.7474SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);7475FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);7476}7477}74787479if (!MemOps.empty())7480Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);74817482return Chain;7483}74847485SDValue PPCTargetLowering::LowerCall_AIX(7486SDValue Chain, SDValue Callee, CallFlags CFlags,7487const SmallVectorImpl<ISD::OutputArg> &Outs,7488const SmallVectorImpl<SDValue> &OutVals,7489const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,7490SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,7491const CallBase *CB) const {7492// See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the7493// AIX ABI stack frame layout.74947495assert((CFlags.CallConv == CallingConv::C ||7496CFlags.CallConv == CallingConv::Cold ||7497CFlags.CallConv == CallingConv::Fast) &&7498"Unexpected calling convention!");74997500if (CFlags.IsPatchPoint)7501report_fatal_error("This call type is unimplemented on AIX.");75027503const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();75047505MachineFunction &MF = DAG.getMachineFunction();7506SmallVector<CCValAssign, 16> ArgLocs;7507AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,7508*DAG.getContext());75097510// Reserve space for the linkage save area (LSA) on the stack.7511// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:7512// [SP][CR][LR][2 x reserved][TOC].7513// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.7514const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();7515const bool IsPPC64 = Subtarget.isPPC64();7516const EVT PtrVT = getPointerTy(DAG.getDataLayout());7517const unsigned PtrByteSize = IsPPC64 ? 
8 : 4;7518CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));7519CCInfo.AnalyzeCallOperands(Outs, CC_AIX);75207521// The prolog code of the callee may store up to 8 GPR argument registers to7522// the stack, allowing va_start to index over them in memory if the callee7523// is variadic.7524// Because we cannot tell if this is needed on the caller side, we have to7525// conservatively assume that it is needed. As such, make sure we have at7526// least enough stack space for the caller to store the 8 GPRs.7527const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;7528const unsigned NumBytes = std::max<unsigned>(7529LinkageSize + MinParameterSaveAreaSize, CCInfo.getStackSize());75307531// Adjust the stack pointer for the new arguments...7532// These operations are automatically eliminated by the prolog/epilog pass.7533Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);7534SDValue CallSeqStart = Chain;75357536SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;7537SmallVector<SDValue, 8> MemOpChains;75387539// Set up a copy of the stack pointer for loading and storing any7540// arguments that may not fit in the registers available for argument7541// passing.7542const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)7543: DAG.getRegister(PPC::R1, MVT::i32);75447545for (unsigned I = 0, E = ArgLocs.size(); I != E;) {7546const unsigned ValNo = ArgLocs[I].getValNo();7547SDValue Arg = OutVals[ValNo];7548ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;75497550if (Flags.isByVal()) {7551const unsigned ByValSize = Flags.getByValSize();75527553// Nothing to do for zero-sized ByVals on the caller side.7554if (!ByValSize) {7555++I;7556continue;7557}75587559auto GetLoad = [&](EVT VT, unsigned LoadOffset) {7560return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,7561(LoadOffset != 0)7562? DAG.getObjectPtrOffset(7563dl, Arg, TypeSize::getFixed(LoadOffset))7564: Arg,7565MachinePointerInfo(), VT);7566};75677568unsigned LoadOffset = 0;75697570// Initialize registers, which are fully occupied by the by-val argument.7571while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {7572SDValue Load = GetLoad(PtrVT, LoadOffset);7573MemOpChains.push_back(Load.getValue(1));7574LoadOffset += PtrByteSize;7575const CCValAssign &ByValVA = ArgLocs[I++];7576assert(ByValVA.getValNo() == ValNo &&7577"Unexpected location for pass-by-value argument.");7578RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));7579}75807581if (LoadOffset == ByValSize)7582continue;75837584// There must be one more loc to handle the remainder.7585assert(ArgLocs[I].getValNo() == ValNo &&7586"Expected additional location for by-value argument.");75877588if (ArgLocs[I].isMemLoc()) {7589assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");7590const CCValAssign &ByValVA = ArgLocs[I++];7591ISD::ArgFlagsTy MemcpyFlags = Flags;7592// Only memcpy the bytes that don't pass in register.7593MemcpyFlags.setByValSize(ByValSize - LoadOffset);7594Chain = CallSeqStart = createMemcpyOutsideCallSeq(7595(LoadOffset != 0) ? DAG.getObjectPtrOffset(7596dl, Arg, TypeSize::getFixed(LoadOffset))7597: Arg,7598DAG.getObjectPtrOffset(7599dl, StackPtr, TypeSize::getFixed(ByValVA.getLocMemOffset())),7600CallSeqStart, MemcpyFlags, DAG, dl);7601continue;7602}76037604// Initialize the final register residue.7605// Any residue that occupies the final by-val arg register must be7606// left-justified on AIX. Loads must be a power-of-2 size and cannot be7607// larger than the ByValSize. 
For example: a 7 byte by-val arg requires 4,
      // 2 and 1 byte loads.
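      // As a concrete illustration (assuming 64-bit mode, i.e. PtrByteSize is
      // 8, and a 7-byte residue): the loop below emits an i32 load at offset
      // 0 shifted left by 32, an i16 load at offset 4 shifted left by 16, and
      // an i8 load at offset 6 shifted left by 8, then ORs the shifted values
      // together so all 7 bytes end up left-justified in the GPR with the low
      // byte cleared.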
      const unsigned ResidueBytes = ByValSize % PtrByteSize;
      assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
             "Unexpected register residue for by-value argument.");
      SDValue ResidueVal;
      for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
        const unsigned N = llvm::bit_floor(ResidueBytes - Bytes);
        const MVT VT =
            N == 1 ? MVT::i8
                   : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
        SDValue Load = GetLoad(VT, LoadOffset);
        MemOpChains.push_back(Load.getValue(1));
        LoadOffset += N;
        Bytes += N;

        // By-val arguments are passed left-justified in register.
        // Every load here needs to be shifted, otherwise a full register load
        // should have been used.
        assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
               "Unexpected load emitted during handling of pass-by-value "
               "argument.");
        unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
        EVT ShiftAmountTy =
            getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
        SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
        SDValue ShiftedLoad =
            DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
        ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
                                              ShiftedLoad)
                                : ShiftedLoad;
      }

      const CCValAssign &ByValVA = ArgLocs[I++];
      RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
      continue;
    }

    CCValAssign &VA = ArgLocs[I++];
    const MVT LocVT = VA.getLocVT();
    const MVT ValVT = VA.getValVT();

    switch (VA.getLocInfo()) {
    default:
      report_fatal_error("Unexpected argument extension type.");
    case CCValAssign::Full:
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc() && !VA.needsCustom()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      continue;
    }

    // Vector arguments passed to VarArg functions need custom handling when
    // they are passed (at least partially) in GPRs.
    if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
      assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
      // Store value to its stack slot.
      SDValue PtrOff =
          DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
      SDValue Store =
          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
      MemOpChains.push_back(Store);
      const unsigned OriginalValNo = VA.getValNo();
      // Then load the GPRs from the stack.
      unsigned LoadOffset = 0;
      auto HandleCustomVecRegLoc = [&]() {
        assert(I != E && "Unexpected end of CCvalAssigns.");
        assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
               "Expected custom RegLoc.");
        CCValAssign RegVA = ArgLocs[I++];
        assert(RegVA.getValNo() == OriginalValNo &&
               "Custom MemLoc ValNo and custom RegLoc ValNo must match.");
        SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                  DAG.getConstant(LoadOffset, dl, PtrVT));
        SDValue Load = DAG.getLoad(PtrVT, dl, Store, Add, MachinePointerInfo());
        MemOpChains.push_back(Load.getValue(1));
        RegsToPass.push_back(std::make_pair(RegVA.getLocReg(), Load));
        LoadOffset += PtrByteSize;
      };

      // In 64-bit there will be exactly 2 custom RegLocs that follow, and
      // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
      // R10.
      HandleCustomVecRegLoc();
      HandleCustomVecRegLoc();

      if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
          ArgLocs[I].getValNo() == OriginalValNo) {
        assert(!IsPPC64 &&
               "Only 2 custom RegLocs expected for 64-bit codegen.");
        HandleCustomVecRegLoc();
        HandleCustomVecRegLoc();
      }

      continue;
    }

    if (VA.isMemLoc()) {
      SDValue PtrOff =
          DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
      MemOpChains.push_back(
          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));

      continue;
    }

    if (!ValVT.isFloatingPoint())
      report_fatal_error(
          "Unexpected register handling for calling convention.");

    // Custom handling is used for GPR initializations for vararg float
    // arguments.
    assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
           LocVT.isInteger() &&
           "Custom register handling only expected for VarArg.");

    SDValue ArgAsInt =
        DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);

    if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
      // f32 in 32-bit GPR
      // f64 in 64-bit GPR
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
    else if (Arg.getValueType().getFixedSizeInBits() <
             LocVT.getFixedSizeInBits())
      // f32 in 64-bit GPR.
      RegsToPass.push_back(std::make_pair(
          VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
    else {
      // f64 in two 32-bit GPRs
      // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
      assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
             "Unexpected custom register for argument!");
      CCValAssign &GPR1 = VA;
      SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
                                     DAG.getConstant(32, dl, MVT::i8));
      RegsToPass.push_back(std::make_pair(
          GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));

      if (I != E) {
        // If only 1 GPR was available, there will only be one custom GPR and
        // the argument will also pass in memory.
        CCValAssign &PeekArg = ArgLocs[I];
        if (PeekArg.isRegLoc() && PeekArg.getValNo() == VA.getValNo()) {
          assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
          CCValAssign &GPR2 = ArgLocs[I++];
          RegsToPass.push_back(std::make_pair(
              GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
        }
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // For indirect calls, we need to save the TOC base to the stack for
  // restoration after the call.
  if (CFlags.IsIndirect) {
    assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
    const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
    const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
    const MVT PtrVT = Subtarget.isPPC64() ?
MVT::i64 : MVT::i32;7780const unsigned TOCSaveOffset =7781Subtarget.getFrameLowering()->getTOCSaveOffset();77827783setUsesTOCBasePtr(DAG);7784SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);7785SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);7786SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);7787SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);7788Chain = DAG.getStore(7789Val.getValue(1), dl, Val, AddPtr,7790MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));7791}77927793// Build a sequence of copy-to-reg nodes chained together with token chain7794// and flag operands which copy the outgoing args into the appropriate regs.7795SDValue InGlue;7796for (auto Reg : RegsToPass) {7797Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InGlue);7798InGlue = Chain.getValue(1);7799}78007801const int SPDiff = 0;7802return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,7803Callee, SPDiff, NumBytes, Ins, InVals, CB);7804}78057806bool7807PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,7808MachineFunction &MF, bool isVarArg,7809const SmallVectorImpl<ISD::OutputArg> &Outs,7810LLVMContext &Context) const {7811SmallVector<CCValAssign, 16> RVLocs;7812CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);7813return CCInfo.CheckReturn(7814Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)7815? RetCC_PPC_Cold7816: RetCC_PPC);7817}78187819SDValue7820PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,7821bool isVarArg,7822const SmallVectorImpl<ISD::OutputArg> &Outs,7823const SmallVectorImpl<SDValue> &OutVals,7824const SDLoc &dl, SelectionDAG &DAG) const {7825SmallVector<CCValAssign, 16> RVLocs;7826CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,7827*DAG.getContext());7828CCInfo.AnalyzeReturn(Outs,7829(Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)7830? RetCC_PPC_Cold7831: RetCC_PPC);78327833SDValue Glue;7834SmallVector<SDValue, 4> RetOps(1, Chain);78357836// Copy the result values into the output registers.7837for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {7838CCValAssign &VA = RVLocs[i];7839assert(VA.isRegLoc() && "Can only return in registers!");78407841SDValue Arg = OutVals[RealResIdx];78427843switch (VA.getLocInfo()) {7844default: llvm_unreachable("Unknown loc info!");7845case CCValAssign::Full: break;7846case CCValAssign::AExt:7847Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);7848break;7849case CCValAssign::ZExt:7850Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);7851break;7852case CCValAssign::SExt:7853Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);7854break;7855}7856if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {7857bool isLittleEndian = Subtarget.isLittleEndian();7858// Legalize ret f64 -> ret 2 x i32.7859SDValue SVal =7860DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,7861DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));7862Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Glue);7863RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));7864SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,7865DAG.getIntPtrConstant(isLittleEndian ? 
1 : 0, dl));
      Glue = Chain.getValue(1);
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Glue);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(PPCISD::RET_GLUE, dl, MVT::Other, RetOps);
}

SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(IntVT);
  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}

SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Construct the stack pointer operand.
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(0);
  SDValue SaveSP = Op.getOperand(1);

  // Load the old link SP.
  SDValue LoadLinkSP =
      DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}

SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current return address save index. The users of this index are
  // primarily the lowering of RETURNADDR.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the return address save area.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}

SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get current frame pointer save index. 
The users of this index will be7958// primarily DYNALLOC instructions.7959PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();7960int FPSI = FI->getFramePointerSaveIndex();79617962// If the frame pointer save index hasn't been defined yet.7963if (!FPSI) {7964// Find out what the fix offset of the frame pointer save area.7965int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();7966// Allocate the frame index for frame pointer save area.7967FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);7968// Save the result.7969FI->setFramePointerSaveIndex(FPSI);7970}7971return DAG.getFrameIndex(FPSI, PtrVT);7972}79737974SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,7975SelectionDAG &DAG) const {7976MachineFunction &MF = DAG.getMachineFunction();7977// Get the inputs.7978SDValue Chain = Op.getOperand(0);7979SDValue Size = Op.getOperand(1);7980SDLoc dl(Op);79817982// Get the correct type for pointers.7983EVT PtrVT = getPointerTy(DAG.getDataLayout());7984// Negate the size.7985SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,7986DAG.getConstant(0, dl, PtrVT), Size);7987// Construct a node for the frame pointer save index.7988SDValue FPSIdx = getFramePointerFrameIndex(DAG);7989SDValue Ops[3] = { Chain, NegSize, FPSIdx };7990SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);7991if (hasInlineStackProbe(MF))7992return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops);7993return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);7994}79957996SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,7997SelectionDAG &DAG) const {7998MachineFunction &MF = DAG.getMachineFunction();79998000bool isPPC64 = Subtarget.isPPC64();8001EVT PtrVT = getPointerTy(DAG.getDataLayout());80028003int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 
8 : 4, 0, false);8004return DAG.getFrameIndex(FI, PtrVT);8005}80068007SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,8008SelectionDAG &DAG) const {8009SDLoc DL(Op);8010return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,8011DAG.getVTList(MVT::i32, MVT::Other),8012Op.getOperand(0), Op.getOperand(1));8013}80148015SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,8016SelectionDAG &DAG) const {8017SDLoc DL(Op);8018return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,8019Op.getOperand(0), Op.getOperand(1));8020}80218022SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {8023if (Op.getValueType().isVector())8024return LowerVectorLoad(Op, DAG);80258026assert(Op.getValueType() == MVT::i1 &&8027"Custom lowering only for i1 loads");80288029// First, load 8 bits into 32 bits, then truncate to 1 bit.80308031SDLoc dl(Op);8032LoadSDNode *LD = cast<LoadSDNode>(Op);80338034SDValue Chain = LD->getChain();8035SDValue BasePtr = LD->getBasePtr();8036MachineMemOperand *MMO = LD->getMemOperand();80378038SDValue NewLD =8039DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,8040BasePtr, MVT::i8, MMO);8041SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);80428043SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };8044return DAG.getMergeValues(Ops, dl);8045}80468047SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {8048if (Op.getOperand(1).getValueType().isVector())8049return LowerVectorStore(Op, DAG);80508051assert(Op.getOperand(1).getValueType() == MVT::i1 &&8052"Custom lowering only for i1 stores");80538054// First, zero extend to 32 bits, then use a truncating store to 8 bits.80558056SDLoc dl(Op);8057StoreSDNode *ST = cast<StoreSDNode>(Op);80588059SDValue Chain = ST->getChain();8060SDValue BasePtr = ST->getBasePtr();8061SDValue Value = ST->getValue();8062MachineMemOperand *MMO = ST->getMemOperand();80638064Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),8065Value);8066return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);8067}80688069// FIXME: Remove this once the ANDI glue bug is fixed:8070SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {8071assert(Op.getValueType() == MVT::i1 &&8072"Custom lowering only for i1 results");80738074SDLoc DL(Op);8075return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));8076}80778078SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,8079SelectionDAG &DAG) const {80808081// Implements a vector truncate that fits in a vector register as a shuffle.8082// We want to legalize vector truncates down to where the source fits in8083// a vector register (and target is therefore smaller than vector register8084// size). 
At that point legalization will try to custom lower the sub-legal8085// result and get here - where we can contain the truncate as a single target8086// operation.80878088// For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:8089// <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>8090//8091// We will implement it for big-endian ordering as this (where x denotes8092// undefined):8093// < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to8094// < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>8095//8096// The same operation in little-endian ordering will be:8097// <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to8098// <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>80998100EVT TrgVT = Op.getValueType();8101assert(TrgVT.isVector() && "Vector type expected.");8102unsigned TrgNumElts = TrgVT.getVectorNumElements();8103EVT EltVT = TrgVT.getVectorElementType();8104if (!isOperationCustom(Op.getOpcode(), TrgVT) ||8105TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) ||8106!llvm::has_single_bit<uint32_t>(EltVT.getSizeInBits()))8107return SDValue();81088109SDValue N1 = Op.getOperand(0);8110EVT SrcVT = N1.getValueType();8111unsigned SrcSize = SrcVT.getSizeInBits();8112if (SrcSize > 256 || !isPowerOf2_32(SrcVT.getVectorNumElements()) ||8113!llvm::has_single_bit<uint32_t>(8114SrcVT.getVectorElementType().getSizeInBits()))8115return SDValue();8116if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)8117return SDValue();81188119unsigned WideNumElts = 128 / EltVT.getSizeInBits();8120EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);81218122SDLoc DL(Op);8123SDValue Op1, Op2;8124if (SrcSize == 256) {8125EVT VecIdxTy = getVectorIdxTy(DAG.getDataLayout());8126EVT SplitVT =8127N1.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());8128unsigned SplitNumElts = SplitVT.getVectorNumElements();8129Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1,8130DAG.getConstant(0, DL, VecIdxTy));8131Op2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1,8132DAG.getConstant(SplitNumElts, DL, VecIdxTy));8133}8134else {8135Op1 = SrcSize == 128 ? 
N1 : widenVec(DAG, N1, DL);8136Op2 = DAG.getUNDEF(WideVT);8137}81388139// First list the elements we want to keep.8140unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();8141SmallVector<int, 16> ShuffV;8142if (Subtarget.isLittleEndian())8143for (unsigned i = 0; i < TrgNumElts; ++i)8144ShuffV.push_back(i * SizeMult);8145else8146for (unsigned i = 1; i <= TrgNumElts; ++i)8147ShuffV.push_back(i * SizeMult - 1);81488149// Populate the remaining elements with undefs.8150for (unsigned i = TrgNumElts; i < WideNumElts; ++i)8151// ShuffV.push_back(i + WideNumElts);8152ShuffV.push_back(WideNumElts + 1);81538154Op1 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op1);8155Op2 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op2);8156return DAG.getVectorShuffle(WideVT, DL, Op1, Op2, ShuffV);8157}81588159/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when8160/// possible.8161SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {8162ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();8163EVT ResVT = Op.getValueType();8164EVT CmpVT = Op.getOperand(0).getValueType();8165SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);8166SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);8167SDLoc dl(Op);81688169// Without power9-vector, we don't have native instruction for f128 comparison.8170// Following transformation to libcall is needed for setcc:8171// select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc cc, x, y), 0, tv, fv, NE8172if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {8173SDValue Z = DAG.getSetCC(8174dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT),8175LHS, RHS, CC);8176SDValue Zero = DAG.getConstant(0, dl, Z.getValueType());8177return DAG.getSelectCC(dl, Z, Zero, TV, FV, ISD::SETNE);8178}81798180// Not FP, or using SPE? 
Not a fsel.8181if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||8182Subtarget.hasSPE())8183return Op;81848185SDNodeFlags Flags = Op.getNode()->getFlags();81868187// We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the8188// presence of infinities.8189if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {8190switch (CC) {8191default:8192break;8193case ISD::SETOGT:8194case ISD::SETGT:8195return DAG.getNode(PPCISD::XSMAXC, dl, Op.getValueType(), LHS, RHS);8196case ISD::SETOLT:8197case ISD::SETLT:8198return DAG.getNode(PPCISD::XSMINC, dl, Op.getValueType(), LHS, RHS);8199}8200}82018202// We might be able to do better than this under some circumstances, but in8203// general, fsel-based lowering of select is a finite-math-only optimization.8204// For more information, see section F.3 of the 2.06 ISA specification.8205// With ISA 3.08206if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||8207(!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()) ||8208ResVT == MVT::f128)8209return Op;82108211// If the RHS of the comparison is a 0.0, we don't need to do the8212// subtraction at all.8213SDValue Sel1;8214if (isFloatingPointZero(RHS))8215switch (CC) {8216default: break; // SETUO etc aren't handled by fsel.8217case ISD::SETNE:8218std::swap(TV, FV);8219[[fallthrough]];8220case ISD::SETEQ:8221if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits8222LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);8223Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);8224if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits8225Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);8226return DAG.getNode(PPCISD::FSEL, dl, ResVT,8227DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);8228case ISD::SETULT:8229case ISD::SETLT:8230std::swap(TV, FV); // fsel is natively setge, swap operands for setlt8231[[fallthrough]];8232case ISD::SETOGE:8233case ISD::SETGE:8234if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits8235LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);8236return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);8237case ISD::SETUGT:8238case ISD::SETGT:8239std::swap(TV, FV); // fsel is natively setge, swap operands for setlt8240[[fallthrough]];8241case ISD::SETOLE:8242case ISD::SETLE:8243if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits8244LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);8245return DAG.getNode(PPCISD::FSEL, dl, ResVT,8246DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);8247}82488249SDValue Cmp;8250switch (CC) {8251default: break; // SETUO etc aren't handled by fsel.8252case ISD::SETNE:8253std::swap(TV, FV);8254[[fallthrough]];8255case ISD::SETEQ:8256Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);8257if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits8258Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);8259Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);8260if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits8261Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);8262return DAG.getNode(PPCISD::FSEL, dl, ResVT,8263DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);8264case ISD::SETULT:8265case ISD::SETLT:8266Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);8267if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits8268Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);8269return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);8270case ISD::SETOGE:8271case ISD::SETGE:8272Cmp = 
DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);8273if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits8274Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);8275return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);8276case ISD::SETUGT:8277case ISD::SETGT:8278Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);8279if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits8280Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);8281return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);8282case ISD::SETOLE:8283case ISD::SETLE:8284Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);8285if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits8286Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);8287return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);8288}8289return Op;8290}82918292static unsigned getPPCStrictOpcode(unsigned Opc) {8293switch (Opc) {8294default:8295llvm_unreachable("No strict version of this opcode!");8296case PPCISD::FCTIDZ:8297return PPCISD::STRICT_FCTIDZ;8298case PPCISD::FCTIWZ:8299return PPCISD::STRICT_FCTIWZ;8300case PPCISD::FCTIDUZ:8301return PPCISD::STRICT_FCTIDUZ;8302case PPCISD::FCTIWUZ:8303return PPCISD::STRICT_FCTIWUZ;8304case PPCISD::FCFID:8305return PPCISD::STRICT_FCFID;8306case PPCISD::FCFIDU:8307return PPCISD::STRICT_FCFIDU;8308case PPCISD::FCFIDS:8309return PPCISD::STRICT_FCFIDS;8310case PPCISD::FCFIDUS:8311return PPCISD::STRICT_FCFIDUS;8312}8313}83148315static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,8316const PPCSubtarget &Subtarget) {8317SDLoc dl(Op);8318bool IsStrict = Op->isStrictFPOpcode();8319bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||8320Op.getOpcode() == ISD::STRICT_FP_TO_SINT;83218322// TODO: Any other flags to propagate?8323SDNodeFlags Flags;8324Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());83258326// For strict nodes, source is the second operand.8327SDValue Src = Op.getOperand(IsStrict ? 1 : 0);8328SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();8329MVT DestTy = Op.getSimpleValueType();8330assert(Src.getValueType().isFloatingPoint() &&8331(DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 ||8332DestTy == MVT::i64) &&8333"Invalid FP_TO_INT types");8334if (Src.getValueType() == MVT::f32) {8335if (IsStrict) {8336Src =8337DAG.getNode(ISD::STRICT_FP_EXTEND, dl,8338DAG.getVTList(MVT::f64, MVT::Other), {Chain, Src}, Flags);8339Chain = Src.getValue(1);8340} else8341Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);8342}8343if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector())8344DestTy = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;8345unsigned Opc = ISD::DELETED_NODE;8346switch (DestTy.SimpleTy) {8347default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");8348case MVT::i32:8349Opc = IsSigned ? PPCISD::FCTIWZ8350: (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);8351break;8352case MVT::i64:8353assert((IsSigned || Subtarget.hasFPCVT()) &&8354"i64 FP_TO_UINT is supported only with FPCVT");8355Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;8356}8357EVT ConvTy = Src.getValueType() == MVT::f128 ? 
MVT::f128 : MVT::f64;8358SDValue Conv;8359if (IsStrict) {8360Opc = getPPCStrictOpcode(Opc);8361Conv = DAG.getNode(Opc, dl, DAG.getVTList(ConvTy, MVT::Other), {Chain, Src},8362Flags);8363} else {8364Conv = DAG.getNode(Opc, dl, ConvTy, Src);8365}8366return Conv;8367}83688369void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,8370SelectionDAG &DAG,8371const SDLoc &dl) const {8372SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);8373bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||8374Op.getOpcode() == ISD::STRICT_FP_TO_SINT;8375bool IsStrict = Op->isStrictFPOpcode();83768377// Convert the FP value to an int value through memory.8378bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&8379(IsSigned || Subtarget.hasFPCVT());8380SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);8381int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();8382MachinePointerInfo MPI =8383MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);83848385// Emit a store to the stack slot.8386SDValue Chain = IsStrict ? Tmp.getValue(1) : DAG.getEntryNode();8387Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));8388if (i32Stack) {8389MachineFunction &MF = DAG.getMachineFunction();8390Alignment = Align(4);8391MachineMemOperand *MMO =8392MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);8393SDValue Ops[] = { Chain, Tmp, FIPtr };8394Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,8395DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);8396} else8397Chain = DAG.getStore(Chain, dl, Tmp, FIPtr, MPI, Alignment);83988399// Result is a load from the stack slot. If loading 4 bytes, make sure to8400// add in a bias on big endian.8401if (Op.getValueType() == MVT::i32 && !i32Stack) {8402FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,8403DAG.getConstant(4, dl, FIPtr.getValueType()));8404MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);8405}84068407RLI.Chain = Chain;8408RLI.Ptr = FIPtr;8409RLI.MPI = MPI;8410RLI.Alignment = Alignment;8411}84128413/// Custom lowers floating point to integer conversions to use8414/// the direct move instructions available in ISA 2.07 to avoid the8415/// need for load/store combinations.8416SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,8417SelectionDAG &DAG,8418const SDLoc &dl) const {8419SDValue Conv = convertFPToInt(Op, DAG, Subtarget);8420SDValue Mov = DAG.getNode(PPCISD::MFVSR, dl, Op.getValueType(), Conv);8421if (Op->isStrictFPOpcode())8422return DAG.getMergeValues({Mov, Conv.getValue(1)}, dl);8423else8424return Mov;8425}84268427SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,8428const SDLoc &dl) const {8429bool IsStrict = Op->isStrictFPOpcode();8430bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||8431Op.getOpcode() == ISD::STRICT_FP_TO_SINT;8432SDValue Src = Op.getOperand(IsStrict ? 1 : 0);8433EVT SrcVT = Src.getValueType();8434EVT DstVT = Op.getValueType();84358436// FP to INT conversions are legal for f128.8437if (SrcVT == MVT::f128)8438return Subtarget.hasP9Vector() ? Op : SDValue();84398440// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on8441// PPC (the libcall is not available).8442if (SrcVT == MVT::ppcf128) {8443if (DstVT == MVT::i32) {8444// TODO: Conservatively pass only nofpexcept flag here. Need to check and8445// set other fast-math flags to FP operations in both strict and8446// non-strict cases. 
(FP_TO_SINT, FSUB)8447SDNodeFlags Flags;8448Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());84498450if (IsSigned) {8451SDValue Lo, Hi;8452std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::f64, MVT::f64);84538454// Add the two halves of the long double in round-to-zero mode, and use8455// a smaller FP_TO_SINT.8456if (IsStrict) {8457SDValue Res = DAG.getNode(PPCISD::STRICT_FADDRTZ, dl,8458DAG.getVTList(MVT::f64, MVT::Other),8459{Op.getOperand(0), Lo, Hi}, Flags);8460return DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,8461DAG.getVTList(MVT::i32, MVT::Other),8462{Res.getValue(1), Res}, Flags);8463} else {8464SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);8465return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);8466}8467} else {8468const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};8469APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));8470SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);8471SDValue SignMask = DAG.getConstant(0x80000000, dl, DstVT);8472if (IsStrict) {8473// Sel = Src < 0x800000008474// FltOfs = select Sel, 0.0, 0x800000008475// IntOfs = select Sel, 0, 0x800000008476// Result = fp_to_sint(Src - FltOfs) ^ IntOfs8477SDValue Chain = Op.getOperand(0);8478EVT SetCCVT =8479getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);8480EVT DstSetCCVT =8481getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT);8482SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT,8483Chain, true);8484Chain = Sel.getValue(1);84858486SDValue FltOfs = DAG.getSelect(8487dl, SrcVT, Sel, DAG.getConstantFP(0.0, dl, SrcVT), Cst);8488Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT);84898490SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl,8491DAG.getVTList(SrcVT, MVT::Other),8492{Chain, Src, FltOfs}, Flags);8493Chain = Val.getValue(1);8494SDValue SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,8495DAG.getVTList(DstVT, MVT::Other),8496{Chain, Val}, Flags);8497Chain = SInt.getValue(1);8498SDValue IntOfs = DAG.getSelect(8499dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), SignMask);8500SDValue Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs);8501return DAG.getMergeValues({Result, Chain}, dl);8502} else {8503// X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X8504// FIXME: generated code sucks.8505SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Src, Cst);8506True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);8507True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, SignMask);8508SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);8509return DAG.getSelectCC(dl, Src, Cst, True, False, ISD::SETGE);8510}8511}8512}85138514return SDValue();8515}85168517if (Subtarget.hasDirectMove() && Subtarget.isPPC64())8518return LowerFP_TO_INTDirectMove(Op, DAG, dl);85198520ReuseLoadInfo RLI;8521LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);85228523return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,8524RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);8525}85268527// We're trying to insert a regular store, S, and then a load, L. If the8528// incoming value, O, is a load, we might just be able to have our load use the8529// address used by O. However, we don't know if anything else will store to8530// that address before we can load from it. To prevent this situation, we need8531// to insert our load, L, into the chain as a peer of O. 
To do this, we give L8532// the same chain operand as O, we create a token factor from the chain results8533// of O and L, and we replace all uses of O's chain result with that token8534// factor (see spliceIntoChain below for this last part).8535bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,8536ReuseLoadInfo &RLI,8537SelectionDAG &DAG,8538ISD::LoadExtType ET) const {8539// Conservatively skip reusing for constrained FP nodes.8540if (Op->isStrictFPOpcode())8541return false;85428543SDLoc dl(Op);8544bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&8545(Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);8546if (ET == ISD::NON_EXTLOAD &&8547(ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&8548isOperationLegalOrCustom(Op.getOpcode(),8549Op.getOperand(0).getValueType())) {85508551LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);8552return true;8553}85548555LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);8556if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||8557LD->isNonTemporal())8558return false;8559if (LD->getMemoryVT() != MemVT)8560return false;85618562// If the result of the load is an illegal type, then we can't build a8563// valid chain for reuse since the legalised loads and token factor node that8564// ties the legalised loads together uses a different output chain then the8565// illegal load.8566if (!isTypeLegal(LD->getValueType(0)))8567return false;85688569RLI.Ptr = LD->getBasePtr();8570if (LD->isIndexed() && !LD->getOffset().isUndef()) {8571assert(LD->getAddressingMode() == ISD::PRE_INC &&8572"Non-pre-inc AM on PPC?");8573RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,8574LD->getOffset());8575}85768577RLI.Chain = LD->getChain();8578RLI.MPI = LD->getPointerInfo();8579RLI.IsDereferenceable = LD->isDereferenceable();8580RLI.IsInvariant = LD->isInvariant();8581RLI.Alignment = LD->getAlign();8582RLI.AAInfo = LD->getAAInfo();8583RLI.Ranges = LD->getRanges();85848585RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);8586return true;8587}85888589// Given the head of the old chain, ResChain, insert a token factor containing8590// it and NewResChain, and make users of ResChain now be users of that token8591// factor.8592// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.8593void PPCTargetLowering::spliceIntoChain(SDValue ResChain,8594SDValue NewResChain,8595SelectionDAG &DAG) const {8596if (!ResChain)8597return;85988599SDLoc dl(NewResChain);86008601SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,8602NewResChain, DAG.getUNDEF(MVT::Other));8603assert(TF.getNode() != NewResChain.getNode() &&8604"A new TF really is required here");86058606DAG.ReplaceAllUsesOfValueWith(ResChain, TF);8607DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);8608}86098610/// Analyze profitability of direct move8611/// prefer float load to int load plus direct move8612/// when there is no integer use of int load8613bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {8614SDNode *Origin = Op.getOperand(Op->isStrictFPOpcode() ? 
1 : 0).getNode();8615if (Origin->getOpcode() != ISD::LOAD)8616return true;86178618// If there is no LXSIBZX/LXSIHZX, like Power8,8619// prefer direct move if the memory size is 1 or 2 bytes.8620MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();8621if (!Subtarget.hasP9Vector() &&8622(!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))8623return true;86248625for (SDNode::use_iterator UI = Origin->use_begin(),8626UE = Origin->use_end();8627UI != UE; ++UI) {86288629// Only look at the users of the loaded value.8630if (UI.getUse().get().getResNo() != 0)8631continue;86328633if (UI->getOpcode() != ISD::SINT_TO_FP &&8634UI->getOpcode() != ISD::UINT_TO_FP &&8635UI->getOpcode() != ISD::STRICT_SINT_TO_FP &&8636UI->getOpcode() != ISD::STRICT_UINT_TO_FP)8637return true;8638}86398640return false;8641}86428643static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,8644const PPCSubtarget &Subtarget,8645SDValue Chain = SDValue()) {8646bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||8647Op.getOpcode() == ISD::STRICT_SINT_TO_FP;8648SDLoc dl(Op);86498650// TODO: Any other flags to propagate?8651SDNodeFlags Flags;8652Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());86538654// If we have FCFIDS, then use it when converting to single-precision.8655// Otherwise, convert to double-precision and then round.8656bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();8657unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS)8658: (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU);8659EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;8660if (Op->isStrictFPOpcode()) {8661if (!Chain)8662Chain = Op.getOperand(0);8663return DAG.getNode(getPPCStrictOpcode(ConvOpc), dl,8664DAG.getVTList(ConvTy, MVT::Other), {Chain, Src}, Flags);8665} else8666return DAG.getNode(ConvOpc, dl, ConvTy, Src);8667}86688669/// Custom lowers integer to floating point conversions to use8670/// the direct move instructions available in ISA 2.07 to avoid the8671/// need for load/store combinations.8672SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,8673SelectionDAG &DAG,8674const SDLoc &dl) const {8675assert((Op.getValueType() == MVT::f32 ||8676Op.getValueType() == MVT::f64) &&8677"Invalid floating point type as target of conversion");8678assert(Subtarget.hasFPCVT() &&8679"Int to FP conversions with direct moves require FPCVT");8680SDValue Src = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);8681bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;8682bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||8683Op.getOpcode() == ISD::STRICT_SINT_TO_FP;8684unsigned MovOpc = (WordInt && !Signed) ? 
PPCISD::MTVSRZ : PPCISD::MTVSRA;8685SDValue Mov = DAG.getNode(MovOpc, dl, MVT::f64, Src);8686return convertIntToFP(Op, Mov, DAG, Subtarget);8687}86888689static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {86908691EVT VecVT = Vec.getValueType();8692assert(VecVT.isVector() && "Expected a vector type.");8693assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");86948695EVT EltVT = VecVT.getVectorElementType();8696unsigned WideNumElts = 128 / EltVT.getSizeInBits();8697EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);86988699unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();8700SmallVector<SDValue, 16> Ops(NumConcat);8701Ops[0] = Vec;8702SDValue UndefVec = DAG.getUNDEF(VecVT);8703for (unsigned i = 1; i < NumConcat; ++i)8704Ops[i] = UndefVec;87058706return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);8707}87088709SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,8710const SDLoc &dl) const {8711bool IsStrict = Op->isStrictFPOpcode();8712unsigned Opc = Op.getOpcode();8713SDValue Src = Op.getOperand(IsStrict ? 1 : 0);8714assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||8715Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&8716"Unexpected conversion type");8717assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&8718"Supports conversions to v2f64/v4f32 only.");87198720// TODO: Any other flags to propagate?8721SDNodeFlags Flags;8722Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());87238724bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;8725bool FourEltRes = Op.getValueType() == MVT::v4f32;87268727SDValue Wide = widenVec(DAG, Src, dl);8728EVT WideVT = Wide.getValueType();8729unsigned WideNumElts = WideVT.getVectorNumElements();8730MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;87318732SmallVector<int, 16> ShuffV;8733for (unsigned i = 0; i < WideNumElts; ++i)8734ShuffV.push_back(i + WideNumElts);87358736int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;8737int SaveElts = FourEltRes ? 4 : 2;8738if (Subtarget.isLittleEndian())8739for (int i = 0; i < SaveElts; i++)8740ShuffV[i * Stride] = i;8741else8742for (int i = 1; i <= SaveElts; i++)8743ShuffV[i * Stride - 1] = i - 1;87448745SDValue ShuffleSrc2 =8746SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);8747SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);87488749SDValue Extend;8750if (SignedConv) {8751Arrange = DAG.getBitcast(IntermediateVT, Arrange);8752EVT ExtVT = Src.getValueType();8753if (Subtarget.hasP9Altivec())8754ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),8755IntermediateVT.getVectorNumElements());87568757Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,8758DAG.getValueType(ExtVT));8759} else8760Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);87618762if (IsStrict)8763return DAG.getNode(Opc, dl, DAG.getVTList(Op.getValueType(), MVT::Other),8764{Op.getOperand(0), Extend}, Flags);87658766return DAG.getNode(Opc, dl, Op.getValueType(), Extend);8767}87688769SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,8770SelectionDAG &DAG) const {8771SDLoc dl(Op);8772bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||8773Op.getOpcode() == ISD::STRICT_SINT_TO_FP;8774bool IsStrict = Op->isStrictFPOpcode();8775SDValue Src = Op.getOperand(IsStrict ? 1 : 0);8776SDValue Chain = IsStrict ? 
Op.getOperand(0) : DAG.getEntryNode();87778778// TODO: Any other flags to propagate?8779SDNodeFlags Flags;8780Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());87818782EVT InVT = Src.getValueType();8783EVT OutVT = Op.getValueType();8784if (OutVT.isVector() && OutVT.isFloatingPoint() &&8785isOperationCustom(Op.getOpcode(), InVT))8786return LowerINT_TO_FPVector(Op, DAG, dl);87878788// Conversions to f128 are legal.8789if (Op.getValueType() == MVT::f128)8790return Subtarget.hasP9Vector() ? Op : SDValue();87918792// Don't handle ppc_fp128 here; let it be lowered to a libcall.8793if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)8794return SDValue();87958796if (Src.getValueType() == MVT::i1) {8797SDValue Sel = DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Src,8798DAG.getConstantFP(1.0, dl, Op.getValueType()),8799DAG.getConstantFP(0.0, dl, Op.getValueType()));8800if (IsStrict)8801return DAG.getMergeValues({Sel, Chain}, dl);8802else8803return Sel;8804}88058806// If we have direct moves, we can do all the conversion, skip the store/load8807// however, without FPCVT we can't do most conversions.8808if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&8809Subtarget.isPPC64() && Subtarget.hasFPCVT())8810return LowerINT_TO_FPDirectMove(Op, DAG, dl);88118812assert((IsSigned || Subtarget.hasFPCVT()) &&8813"UINT_TO_FP is supported only with FPCVT");88148815if (Src.getValueType() == MVT::i64) {8816SDValue SINT = Src;8817// When converting to single-precision, we actually need to convert8818// to double-precision first and then round to single-precision.8819// To avoid double-rounding effects during that operation, we have8820// to prepare the input operand. Bits that might be truncated when8821// converting to double-precision are replaced by a bit that won't8822// be lost at this stage, but is below the single-precision rounding8823// position.8824//8825// However, if -enable-unsafe-fp-math is in effect, accept double8826// rounding to avoid the extra overhead.8827if (Op.getValueType() == MVT::f32 &&8828!Subtarget.hasFPCVT() &&8829!DAG.getTarget().Options.UnsafeFPMath) {88308831// Twiddle input to make sure the low 11 bits are zero. (If this8832// is the case, we are guaranteed the value will fit into the 53 bit8833// mantissa of an IEEE double-precision value without rounding.)8834// If any of those low 11 bits were not zero originally, make sure8835// bit 12 (value 2048) is set instead, so that the final rounding8836// to single-precision gets the correct result.8837SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,8838SINT, DAG.getConstant(2047, dl, MVT::i64));8839Round = DAG.getNode(ISD::ADD, dl, MVT::i64,8840Round, DAG.getConstant(2047, dl, MVT::i64));8841Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);8842Round = DAG.getNode(ISD::AND, dl, MVT::i64,8843Round, DAG.getConstant(-2048, dl, MVT::i64));88448845// However, we cannot use that value unconditionally: if the magnitude8846// of the input value is small, the bit-twiddling we did above might8847// end up visibly changing the output. Fortunately, in that case, we8848// don't need to twiddle bits since the original input will convert8849// exactly to double-precision floating-point already. 
Therefore,8850// construct a conditional to use the original value if the top 118851// bits are all sign-bit copies, and use the rounded value computed8852// above otherwise.8853SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,8854SINT, DAG.getConstant(53, dl, MVT::i32));8855Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,8856Cond, DAG.getConstant(1, dl, MVT::i64));8857Cond = DAG.getSetCC(8858dl,8859getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),8860Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);88618862SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);8863}88648865ReuseLoadInfo RLI;8866SDValue Bits;88678868MachineFunction &MF = DAG.getMachineFunction();8869if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {8870Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,8871RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);8872spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);8873} else if (Subtarget.hasLFIWAX() &&8874canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {8875MachineMemOperand *MMO =8876MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,8877RLI.Alignment, RLI.AAInfo, RLI.Ranges);8878SDValue Ops[] = { RLI.Chain, RLI.Ptr };8879Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,8880DAG.getVTList(MVT::f64, MVT::Other),8881Ops, MVT::i32, MMO);8882spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);8883} else if (Subtarget.hasFPCVT() &&8884canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {8885MachineMemOperand *MMO =8886MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,8887RLI.Alignment, RLI.AAInfo, RLI.Ranges);8888SDValue Ops[] = { RLI.Chain, RLI.Ptr };8889Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,8890DAG.getVTList(MVT::f64, MVT::Other),8891Ops, MVT::i32, MMO);8892spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);8893} else if (((Subtarget.hasLFIWAX() &&8894SINT.getOpcode() == ISD::SIGN_EXTEND) ||8895(Subtarget.hasFPCVT() &&8896SINT.getOpcode() == ISD::ZERO_EXTEND)) &&8897SINT.getOperand(0).getValueType() == MVT::i32) {8898MachineFrameInfo &MFI = MF.getFrameInfo();8899EVT PtrVT = getPointerTy(DAG.getDataLayout());89008901int FrameIdx = MFI.CreateStackObject(4, Align(4), false);8902SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);89038904SDValue Store = DAG.getStore(Chain, dl, SINT.getOperand(0), FIdx,8905MachinePointerInfo::getFixedStack(8906DAG.getMachineFunction(), FrameIdx));8907Chain = Store;89088909assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&8910"Expected an i32 store");89118912RLI.Ptr = FIdx;8913RLI.Chain = Chain;8914RLI.MPI =8915MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);8916RLI.Alignment = Align(4);89178918MachineMemOperand *MMO =8919MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,8920RLI.Alignment, RLI.AAInfo, RLI.Ranges);8921SDValue Ops[] = { RLI.Chain, RLI.Ptr };8922Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?8923PPCISD::LFIWZX : PPCISD::LFIWAX,8924dl, DAG.getVTList(MVT::f64, MVT::Other),8925Ops, MVT::i32, MMO);8926Chain = Bits.getValue(1);8927} else8928Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);89298930SDValue FP = convertIntToFP(Op, Bits, DAG, Subtarget, Chain);8931if (IsStrict)8932Chain = FP.getValue(1);89338934if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {8935if (IsStrict)8936FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl,8937DAG.getVTList(MVT::f32, MVT::Other),8938{Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);8939else8940FP = DAG.getNode(ISD::FP_ROUND, 
dl, MVT::f32, FP,8941DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));8942}8943return FP;8944}89458946assert(Src.getValueType() == MVT::i32 &&8947"Unhandled INT_TO_FP type in custom expander!");8948// Since we only generate this in 64-bit mode, we can take advantage of8949// 64-bit registers. In particular, sign extend the input value into the8950// 64-bit register with extsw, store the WHOLE 64-bit value into the stack8951// then lfd it and fcfid it.8952MachineFunction &MF = DAG.getMachineFunction();8953MachineFrameInfo &MFI = MF.getFrameInfo();8954EVT PtrVT = getPointerTy(MF.getDataLayout());89558956SDValue Ld;8957if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {8958ReuseLoadInfo RLI;8959bool ReusingLoad;8960if (!(ReusingLoad = canReuseLoadAddress(Src, MVT::i32, RLI, DAG))) {8961int FrameIdx = MFI.CreateStackObject(4, Align(4), false);8962SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);89638964SDValue Store = DAG.getStore(Chain, dl, Src, FIdx,8965MachinePointerInfo::getFixedStack(8966DAG.getMachineFunction(), FrameIdx));8967Chain = Store;89688969assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&8970"Expected an i32 store");89718972RLI.Ptr = FIdx;8973RLI.Chain = Chain;8974RLI.MPI =8975MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);8976RLI.Alignment = Align(4);8977}89788979MachineMemOperand *MMO =8980MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,8981RLI.Alignment, RLI.AAInfo, RLI.Ranges);8982SDValue Ops[] = { RLI.Chain, RLI.Ptr };8983Ld = DAG.getMemIntrinsicNode(IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl,8984DAG.getVTList(MVT::f64, MVT::Other), Ops,8985MVT::i32, MMO);8986Chain = Ld.getValue(1);8987if (ReusingLoad)8988spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);8989} else {8990assert(Subtarget.isPPC64() &&8991"i32->FP without LFIWAX supported only on PPC64");89928993int FrameIdx = MFI.CreateStackObject(8, Align(8), false);8994SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);89958996SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Src);89978998// STD the extended value into the stack slot.8999SDValue Store = DAG.getStore(9000Chain, dl, Ext64, FIdx,9001MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));9002Chain = Store;90039004// Load the value as a double.9005Ld = DAG.getLoad(9006MVT::f64, dl, Chain, FIdx,9007MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));9008Chain = Ld.getValue(1);9009}90109011// FCFID it and return it.9012SDValue FP = convertIntToFP(Op, Ld, DAG, Subtarget, Chain);9013if (IsStrict)9014Chain = FP.getValue(1);9015if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {9016if (IsStrict)9017FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl,9018DAG.getVTList(MVT::f32, MVT::Other),9019{Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);9020else9021FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,9022DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));9023}9024return FP;9025}90269027SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op,9028SelectionDAG &DAG) const {9029SDLoc dl(Op);9030/*9031The rounding mode is in bits 30:31 of FPSR, and has the following9032settings:903300 Round to nearest903401 Round to 0903510 Round to +inf903611 Round to -inf90379038GET_ROUNDING, on the other hand, expects the following:9039-1 Undefined90400 Round to 090411 Round to nearest90422 Round to +inf90433 Round to -inf90449045To perform the conversion, we do:9046((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))9047*/90489049MachineFunction &MF = DAG.getMachineFunction();9050EVT VT = 
Op.getValueType();9051EVT PtrVT = getPointerTy(MF.getDataLayout());90529053// Save FP Control Word to register9054SDValue Chain = Op.getOperand(0);9055SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);9056Chain = MFFS.getValue(1);90579058SDValue CWD;9059if (isTypeLegal(MVT::i64)) {9060CWD = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,9061DAG.getNode(ISD::BITCAST, dl, MVT::i64, MFFS));9062} else {9063// Save FP register to stack slot9064int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);9065SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);9066Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());90679068// Load FP Control Word from low 32 bits of stack slot.9069assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&9070"Stack slot adjustment is valid only on big endian subtargets!");9071SDValue Four = DAG.getConstant(4, dl, PtrVT);9072SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);9073CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());9074Chain = CWD.getValue(1);9075}90769077// Transform as necessary9078SDValue CWD1 =9079DAG.getNode(ISD::AND, dl, MVT::i32,9080CWD, DAG.getConstant(3, dl, MVT::i32));9081SDValue CWD2 =9082DAG.getNode(ISD::SRL, dl, MVT::i32,9083DAG.getNode(ISD::AND, dl, MVT::i32,9084DAG.getNode(ISD::XOR, dl, MVT::i32,9085CWD, DAG.getConstant(3, dl, MVT::i32)),9086DAG.getConstant(3, dl, MVT::i32)),9087DAG.getConstant(1, dl, MVT::i32));90889089SDValue RetVal =9090DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);90919092RetVal =9093DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),9094dl, VT, RetVal);90959096return DAG.getMergeValues({RetVal, Chain}, dl);9097}90989099SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {9100EVT VT = Op.getValueType();9101unsigned BitWidth = VT.getSizeInBits();9102SDLoc dl(Op);9103assert(Op.getNumOperands() == 3 &&9104VT == Op.getOperand(1).getValueType() &&9105"Unexpected SHL!");91069107// Expand into a bunch of logical ops. Note that these ops9108// depend on the PPC behavior for oversized shift amounts.9109SDValue Lo = Op.getOperand(0);9110SDValue Hi = Op.getOperand(1);9111SDValue Amt = Op.getOperand(2);9112EVT AmtVT = Amt.getValueType();91139114SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,9115DAG.getConstant(BitWidth, dl, AmtVT), Amt);9116SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);9117SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);9118SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);9119SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,9120DAG.getConstant(-BitWidth, dl, AmtVT));9121SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);9122SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);9123SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);9124SDValue OutOps[] = { OutLo, OutHi };9125return DAG.getMergeValues(OutOps, dl);9126}91279128SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {9129EVT VT = Op.getValueType();9130SDLoc dl(Op);9131unsigned BitWidth = VT.getSizeInBits();9132assert(Op.getNumOperands() == 3 &&9133VT == Op.getOperand(1).getValueType() &&9134"Unexpected SRL!");91359136// Expand into a bunch of logical ops. 
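// Illustrative standalone sketch (hypothetical name) of the FPSCR RN to
// GET_ROUNDING mapping computed in LowerGET_ROUNDING above:
//   (RN & 3) ^ ((~RN & 3) >> 1) maps 0->1, 1->0, 2->2, 3->3,
// i.e. "nearest" and "toward zero" swap encodings while +inf/-inf pass through.
static unsigned mapFPSCRRoundingSketch(unsigned RN) {
  return (RN & 3) ^ ((~RN & 3) >> 1);
}
// mapFPSCRRoundingSketch(0) == 1 (nearest),  mapFPSCRRoundingSketch(1) == 0
// (toward zero), mapFPSCRRoundingSketch(2) == 2 (+inf), (3) == 3 (-inf).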
Note that these ops9137// depend on the PPC behavior for oversized shift amounts.9138SDValue Lo = Op.getOperand(0);9139SDValue Hi = Op.getOperand(1);9140SDValue Amt = Op.getOperand(2);9141EVT AmtVT = Amt.getValueType();91429143SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,9144DAG.getConstant(BitWidth, dl, AmtVT), Amt);9145SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);9146SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);9147SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);9148SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,9149DAG.getConstant(-BitWidth, dl, AmtVT));9150SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);9151SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);9152SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);9153SDValue OutOps[] = { OutLo, OutHi };9154return DAG.getMergeValues(OutOps, dl);9155}91569157SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {9158SDLoc dl(Op);9159EVT VT = Op.getValueType();9160unsigned BitWidth = VT.getSizeInBits();9161assert(Op.getNumOperands() == 3 &&9162VT == Op.getOperand(1).getValueType() &&9163"Unexpected SRA!");91649165// Expand into a bunch of logical ops, followed by a select_cc.9166SDValue Lo = Op.getOperand(0);9167SDValue Hi = Op.getOperand(1);9168SDValue Amt = Op.getOperand(2);9169EVT AmtVT = Amt.getValueType();91709171SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,9172DAG.getConstant(BitWidth, dl, AmtVT), Amt);9173SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);9174SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);9175SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);9176SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,9177DAG.getConstant(-BitWidth, dl, AmtVT));9178SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);9179SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);9180SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),9181Tmp4, Tmp6, ISD::SETLE);9182SDValue OutOps[] = { OutLo, OutHi };9183return DAG.getMergeValues(OutOps, dl);9184}91859186SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,9187SelectionDAG &DAG) const {9188SDLoc dl(Op);9189EVT VT = Op.getValueType();9190unsigned BitWidth = VT.getSizeInBits();91919192bool IsFSHL = Op.getOpcode() == ISD::FSHL;9193SDValue X = Op.getOperand(0);9194SDValue Y = Op.getOperand(1);9195SDValue Z = Op.getOperand(2);9196EVT AmtVT = Z.getValueType();91979198// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))9199// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))9200// This is simpler than TargetLowering::expandFunnelShift because we can rely9201// on PowerPC shift by BW being well defined.9202Z = DAG.getNode(ISD::AND, dl, AmtVT, Z,9203DAG.getConstant(BitWidth - 1, dl, AmtVT));9204SDValue SubZ =9205DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Z);9206X = DAG.getNode(PPCISD::SHL, dl, VT, X, IsFSHL ? Z : SubZ);9207Y = DAG.getNode(PPCISD::SRL, dl, VT, Y, IsFSHL ? SubZ : Z);9208return DAG.getNode(ISD::OR, dl, VT, X, Y);9209}92109211//===----------------------------------------------------------------------===//9212// Vector related lowering.9213//92149215/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an9216/// element size of SplatSize. 
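// Illustrative standalone sketch (hypothetical name, assumes <cstdint>) of the
// fshl expansion used in LowerFunnelShift above for 64-bit values. The PPC
// SHL/SRL nodes produce 0 for a shift amount equal to the bit width, which
// plain C++ leaves undefined, so that behaviour is modelled explicitly here.
static uint64_t fshl64Sketch(uint64_t X, uint64_t Y, unsigned Z) {
  const unsigned BW = 64;
  Z &= BW - 1;
  auto Shl = [&](uint64_t V, unsigned A) { return A >= BW ? uint64_t(0) : V << A; };
  auto Srl = [&](uint64_t V, unsigned A) { return A >= BW ? uint64_t(0) : V >> A; };
  // (X << (Z % BW)) | (Y >> (BW - (Z % BW))); Z == 0 yields X as required.
  return Shl(X, Z) | Srl(Y, BW - Z);
}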
Cast the result to VT.9217static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,9218SelectionDAG &DAG, const SDLoc &dl) {9219static const MVT VTys[] = { // canonical VT to use for each size.9220MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i329221};92229223EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];92249225// For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.9226if (Val == ((1LLU << (SplatSize * 8)) - 1)) {9227SplatSize = 1;9228Val = 0xFF;9229}92309231EVT CanonicalVT = VTys[SplatSize-1];92329233// Build a canonical splat for this value.9234return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));9235}92369237/// BuildIntrinsicOp - Return a unary operator intrinsic node with the9238/// specified intrinsic ID.9239static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,9240const SDLoc &dl, EVT DestVT = MVT::Other) {9241if (DestVT == MVT::Other) DestVT = Op.getValueType();9242return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,9243DAG.getConstant(IID, dl, MVT::i32), Op);9244}92459246/// BuildIntrinsicOp - Return a binary operator intrinsic node with the9247/// specified intrinsic ID.9248static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,9249SelectionDAG &DAG, const SDLoc &dl,9250EVT DestVT = MVT::Other) {9251if (DestVT == MVT::Other) DestVT = LHS.getValueType();9252return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,9253DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);9254}92559256/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the9257/// specified intrinsic ID.9258static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,9259SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,9260EVT DestVT = MVT::Other) {9261if (DestVT == MVT::Other) DestVT = Op0.getValueType();9262return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,9263DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);9264}92659266/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified9267/// amount. The result has the specified value type.9268static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,9269SelectionDAG &DAG, const SDLoc &dl) {9270// Force LHS/RHS to be the right type.9271LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);9272RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);92739274int Ops[16];9275for (unsigned i = 0; i != 16; ++i)9276Ops[i] = i + Amt;9277SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);9278return DAG.getNode(ISD::BITCAST, dl, VT, T);9279}92809281/// Do we have an efficient pattern in a .td file for this node?9282///9283/// \param V - pointer to the BuildVectorSDNode being matched9284/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?9285///9286/// There are some patterns where it is beneficial to keep a BUILD_VECTOR9287/// node as a BUILD_VECTOR node rather than expanding it. 
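// Worked example (standalone values) of the all-ones canonicalization in
// getCanonicalConstSplat above: a 2-byte splat of 0xFFFF is re-expressed as a
// 1-byte splat of 0xFF, so the rest of the lowering treats it as a v16i8 splat.
{
  uint64_t Val = 0xFFFF;
  unsigned SplatSize = 2;
  if (Val == ((1LLU << (SplatSize * 8)) - 1)) { // all ones at this width
    SplatSize = 1;
    Val = 0xFF;
  }
  (void)Val; (void)SplatSize; // Val == 0xFF, SplatSize == 1 -> MVT::v16i8.
}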
The patterns where9288/// the opposite is true (expansion is beneficial) are:9289/// - The node builds a vector out of integers that are not 32 or 64-bits9290/// - The node builds a vector out of constants9291/// - The node is a "load-and-splat"9292/// In all other cases, we will choose to keep the BUILD_VECTOR.9293static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,9294bool HasDirectMove,9295bool HasP8Vector) {9296EVT VecVT = V->getValueType(0);9297bool RightType = VecVT == MVT::v2f64 ||9298(HasP8Vector && VecVT == MVT::v4f32) ||9299(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));9300if (!RightType)9301return false;93029303bool IsSplat = true;9304bool IsLoad = false;9305SDValue Op0 = V->getOperand(0);93069307// This function is called in a block that confirms the node is not a constant9308// splat. So a constant BUILD_VECTOR here means the vector is built out of9309// different constants.9310if (V->isConstant())9311return false;9312for (int i = 0, e = V->getNumOperands(); i < e; ++i) {9313if (V->getOperand(i).isUndef())9314return false;9315// We want to expand nodes that represent load-and-splat even if the9316// loaded value is a floating point truncation or conversion to int.9317if (V->getOperand(i).getOpcode() == ISD::LOAD ||9318(V->getOperand(i).getOpcode() == ISD::FP_ROUND &&9319V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||9320(V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&9321V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||9322(V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&9323V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))9324IsLoad = true;9325// If the operands are different or the input is not a load and has more9326// uses than just this BV node, then it isn't a splat.9327if (V->getOperand(i) != Op0 ||9328(!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))9329IsSplat = false;9330}9331return !(IsSplat && IsLoad);9332}93339334// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.9335SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {93369337SDLoc dl(Op);9338SDValue Op0 = Op->getOperand(0);93399340if (!Subtarget.isPPC64() || (Op0.getOpcode() != ISD::BUILD_PAIR) ||9341(Op.getValueType() != MVT::f128))9342return SDValue();93439344SDValue Lo = Op0.getOperand(0);9345SDValue Hi = Op0.getOperand(1);9346if ((Lo.getValueType() != MVT::i64) || (Hi.getValueType() != MVT::i64))9347return SDValue();93489349if (!Subtarget.isLittleEndian())9350std::swap(Lo, Hi);93519352return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Lo, Hi);9353}93549355static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {9356const SDValue *InputLoad = &Op;9357while (InputLoad->getOpcode() == ISD::BITCAST)9358InputLoad = &InputLoad->getOperand(0);9359if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||9360InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {9361IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;9362InputLoad = &InputLoad->getOperand(0);9363}9364if (InputLoad->getOpcode() != ISD::LOAD)9365return nullptr;9366LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);9367return ISD::isNormalLoad(LD) ? InputLoad : nullptr;9368}93699370// Convert the argument APFloat to a single precision APFloat if there is no9371// loss in information during the conversion to single precision APFloat and the9372// resulting number is not a denormal number. 
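// Plain-C++ analogue (standalone sketch, hypothetical name, assumes <cmath>
// and <limits>) of the APFloat-based check defined just below, which is used
// later to decide whether a double splat can be emitted with XXSPLTIDP: the
// double must convert to single precision exactly and must not become denormal.
static bool fitsNonDenormSingleSketch(double D) {
  float F = static_cast<float>(D);
  bool IsDenormal =
      F != 0.0f && std::fabs(F) < std::numeric_limits<float>::min();
  return !IsDenormal && static_cast<double>(F) == D;
}
// e.g. 1.0 and 0.5 pass, 0.1 fails (inexact), 1e-40 fails (denormal float).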
Return true if successful.9373bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {9374APFloat APFloatToConvert = ArgAPFloat;9375bool LosesInfo = true;9376APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,9377&LosesInfo);9378bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());9379if (Success)9380ArgAPFloat = APFloatToConvert;9381return Success;9382}93839384// Bitcast the argument APInt to a double and convert it to a single precision9385// APFloat, bitcast the APFloat to an APInt and assign it to the original9386// argument if there is no loss in information during the conversion from9387// double to single precision APFloat and the resulting number is not a denormal9388// number. Return true if successful.9389bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {9390double DpValue = ArgAPInt.bitsToDouble();9391APFloat APFloatDp(DpValue);9392bool Success = convertToNonDenormSingle(APFloatDp);9393if (Success)9394ArgAPInt = APFloatDp.bitcastToAPInt();9395return Success;9396}93979398// Nondestructive check for convertTonNonDenormSingle.9399bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {9400// Only convert if it loses info, since XXSPLTIDP should9401// handle the other case.9402APFloat APFloatToConvert = ArgAPFloat;9403bool LosesInfo = true;9404APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,9405&LosesInfo);94069407return (!LosesInfo && !APFloatToConvert.isDenormal());9408}94099410static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,9411unsigned &Opcode) {9412LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0));9413if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode))9414return false;94159416EVT Ty = Op->getValueType(0);9417// For v2f64, v4f32 and v4i32 types, we require the load to be non-extending9418// as we cannot handle extending loads for these types.9419if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&9420ISD::isNON_EXTLoad(InputNode))9421return true;94229423EVT MemVT = InputNode->getMemoryVT();9424// For v8i16 and v16i8 types, extending loads can be handled as long as the9425// memory VT is the same vector element VT type.9426// The loads feeding into the v8i16 and v16i8 types will be extending because9427// scalar i8/i16 are not legal types.9428if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) &&9429(MemVT == Ty.getVectorElementType()))9430return true;94319432if (Ty == MVT::v2i64) {9433// Check the extend type, when the input type is i32, and the output vector9434// type is v2i64.9435if (MemVT == MVT::i32) {9436if (ISD::isZEXTLoad(InputNode))9437Opcode = PPCISD::ZEXT_LD_SPLAT;9438if (ISD::isSEXTLoad(InputNode))9439Opcode = PPCISD::SEXT_LD_SPLAT;9440}9441return true;9442}9443return false;9444}94459446// If this is a case we can't handle, return null and let the default9447// expansion code take care of it. If we CAN select this case, and if it9448// selects to a single instruction, return Op. 
Otherwise, if we can codegen9449// this case more efficiently than a constant pool load, lower it to the9450// sequence of ops that should be used.9451SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,9452SelectionDAG &DAG) const {9453SDLoc dl(Op);9454BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());9455assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");94569457// Check if this is a splat of a constant value.9458APInt APSplatBits, APSplatUndef;9459unsigned SplatBitSize;9460bool HasAnyUndefs;9461bool BVNIsConstantSplat =9462BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,9463HasAnyUndefs, 0, !Subtarget.isLittleEndian());94649465// If it is a splat of a double, check if we can shrink it to a 32 bit9466// non-denormal float which when converted back to double gives us the same9467// double. This is to exploit the XXSPLTIDP instruction.9468// If we lose precision, we use XXSPLTI32DX.9469if (BVNIsConstantSplat && (SplatBitSize == 64) &&9470Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {9471// Check the type first to short-circuit so we don't modify APSplatBits if9472// this block isn't executed.9473if ((Op->getValueType(0) == MVT::v2f64) &&9474convertToNonDenormSingle(APSplatBits)) {9475SDValue SplatNode = DAG.getNode(9476PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,9477DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));9478return DAG.getBitcast(Op.getValueType(), SplatNode);9479} else {9480// We may lose precision, so we have to use XXSPLTI32DX.94819482uint32_t Hi =9483(uint32_t)((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32);9484uint32_t Lo =9485(uint32_t)(APSplatBits.getZExtValue() & 0xFFFFFFFF);9486SDValue SplatNode = DAG.getUNDEF(MVT::v2i64);94879488if (!Hi || !Lo)9489// If either load is 0, then we should generate XXLXOR to set to 0.9490SplatNode = DAG.getTargetConstant(0, dl, MVT::v2i64);94919492if (Hi)9493SplatNode = DAG.getNode(9494PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,9495DAG.getTargetConstant(0, dl, MVT::i32),9496DAG.getTargetConstant(Hi, dl, MVT::i32));94979498if (Lo)9499SplatNode =9500DAG.getNode(PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,9501DAG.getTargetConstant(1, dl, MVT::i32),9502DAG.getTargetConstant(Lo, dl, MVT::i32));95039504return DAG.getBitcast(Op.getValueType(), SplatNode);9505}9506}95079508if (!BVNIsConstantSplat || SplatBitSize > 32) {9509unsigned NewOpcode = PPCISD::LD_SPLAT;95109511// Handle load-and-splat patterns as we have instructions that will do this9512// in one go.9513if (DAG.isSplatValue(Op, true) &&9514isValidSplatLoad(Subtarget, Op, NewOpcode)) {9515const SDValue *InputLoad = &Op.getOperand(0);9516LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);95179518// If the input load is an extending load, it will be an i32 -> i649519// extending load and isValidSplatLoad() will update NewOpcode.9520unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();9521unsigned ElementSize =9522MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);95239524assert(((ElementSize == 2 * MemorySize)9525? 
(NewOpcode == PPCISD::ZEXT_LD_SPLAT ||9526NewOpcode == PPCISD::SEXT_LD_SPLAT)9527: (NewOpcode == PPCISD::LD_SPLAT)) &&9528"Unmatched element size and opcode!\n");95299530// Checking for a single use of this load, we have to check for vector9531// width (128 bits) / ElementSize uses (since each operand of the9532// BUILD_VECTOR is a separate use of the value.9533unsigned NumUsesOfInputLD = 128 / ElementSize;9534for (SDValue BVInOp : Op->ops())9535if (BVInOp.isUndef())9536NumUsesOfInputLD--;95379538// Exclude somes case where LD_SPLAT is worse than scalar_to_vector:9539// Below cases should also happen for "lfiwzx/lfiwax + LE target + index9540// 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index9541// 15", but function IsValidSplatLoad() now will only return true when9542// the data at index 0 is not nullptr. So we will not get into trouble for9543// these cases.9544//9545// case 1 - lfiwzx/lfiwax9546// 1.1: load result is i32 and is sign/zero extend to i64;9547// 1.2: build a v2i64 vector type with above loaded value;9548// 1.3: the vector has only one value at index 0, others are all undef;9549// 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.9550if (NumUsesOfInputLD == 1 &&9551(Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&9552!Subtarget.isLittleEndian() && Subtarget.hasVSX() &&9553Subtarget.hasLFIWAX()))9554return SDValue();95559556// case 2 - lxvr[hb]x9557// 2.1: load result is at most i16;9558// 2.2: build a vector with above loaded value;9559// 2.3: the vector has only one value at index 0, others are all undef;9560// 2.4: on LE target, so that lxvr[hb]x does not need any permute.9561if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&9562Subtarget.isISA3_1() && ElementSize <= 16)9563return SDValue();95649565assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");9566if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&9567Subtarget.hasVSX()) {9568SDValue Ops[] = {9569LD->getChain(), // Chain9570LD->getBasePtr(), // Ptr9571DAG.getValueType(Op.getValueType()) // VT9572};9573SDValue LdSplt = DAG.getMemIntrinsicNode(9574NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,9575LD->getMemoryVT(), LD->getMemOperand());9576// Replace all uses of the output chain of the original load with the9577// output chain of the new load.9578DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),9579LdSplt.getValue(1));9580return LdSplt;9581}9582}95839584// In 64BIT mode BUILD_VECTOR nodes that are not constant splats of up to9585// 32-bits can be lowered to VSX instructions under certain conditions.9586// Without VSX, there is no pattern more efficient than expanding the node.9587if (Subtarget.hasVSX() && Subtarget.isPPC64() &&9588haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),9589Subtarget.hasP8Vector()))9590return Op;9591return SDValue();9592}95939594uint64_t SplatBits = APSplatBits.getZExtValue();9595uint64_t SplatUndef = APSplatUndef.getZExtValue();9596unsigned SplatSize = SplatBitSize / 8;95979598// First, handle single instruction cases.95999600// All zeros?9601if (SplatBits == 0) {9602// Canonicalize all zero vectors to be v4i32.9603if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {9604SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);9605Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);9606}9607return Op;9608}96099610// We have XXSPLTIW for constant splats four bytes wide.9611// Given vector length is a multiple of 4, 2-byte splats can be replaced9612// with 
4-byte splats. We replicate the SplatBits in case of 2-byte splat to9613// make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be9614// turned into a 4-byte splat of 0xABABABAB.9615if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 2)9616return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,9617Op.getValueType(), DAG, dl);96189619if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector() && SplatSize == 4)9620return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,9621dl);96229623// We have XXSPLTIB for constant splats one byte wide.9624if (Subtarget.hasP9Vector() && SplatSize == 1)9625return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,9626dl);96279628// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].9629int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>9630(32-SplatBitSize));9631if (SextVal >= -16 && SextVal <= 15)9632return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,9633dl);96349635// Two instruction sequences.96369637// If this value is in the range [-32,30] and is even, use:9638// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)9639// If this value is in the range [17,31] and is odd, use:9640// VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)9641// If this value is in the range [-31,-17] and is odd, use:9642// VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)9643// Note the last two are three-instruction sequences.9644if (SextVal >= -32 && SextVal <= 31) {9645// To avoid having these optimizations undone by constant folding,9646// we convert to a pseudo that will be expanded later into one of9647// the above forms.9648SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);9649EVT VT = (SplatSize == 1 ? MVT::v16i8 :9650(SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));9651SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);9652SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);9653if (VT == Op.getValueType())9654return RetVal;9655else9656return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);9657}96589659// If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is9660// 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important9661// for fneg/fabs.9662if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {9663// Make -1 and vspltisw -1:9664SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);96659666// Make the VSLW intrinsic, computing 0x8000_0000.9667SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,9668OnesV, DAG, dl);96699670// xor by OnesV to invert it.9671Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);9672return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);9673}96749675// Check to see if this is a wide variety of vsplti*, binop self cases.9676static const signed char SplatCsts[] = {9677-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,9678-8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -169679};96809681for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) {9682// Indirect through the SplatCsts array so that we favor 'vsplti -1' for9683// cases which are ambiguous (e.g. formation of 0x8000_0000). 
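// Worked example (standalone values) of the sign-extended splat value computed
// above: for a v8i16 splat of 0xFFF0, SplatBitSize is 16, so
// SextVal = int32_t(0xFFF0 << 16) >> 16 = -16, which lies in [-16,15] and is
// therefore covered by a single VSPLTI[bhw] (here vspltish -16).
{
  uint64_t SplatBits = 0xFFF0;
  unsigned SplatBitSize = 16;
  int32_t SextVal =
      (int32_t(SplatBits << (32 - SplatBitSize)) >> (32 - SplatBitSize));
  assert(SextVal == -16 && "0xFFF0 sign-extends to -16 at 16 bits");
}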
'vsplti -1'9684int i = SplatCsts[idx];96859686// Figure out what shift amount will be used by altivec if shifted by i in9687// this splat size.9688unsigned TypeShiftAmt = i & (SplatBitSize-1);96899690// vsplti + shl self.9691if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {9692SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);9693static const unsigned IIDs[] = { // Intrinsic to use for each size.9694Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,9695Intrinsic::ppc_altivec_vslw9696};9697Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);9698return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);9699}97009701// vsplti + srl self.9702if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {9703SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);9704static const unsigned IIDs[] = { // Intrinsic to use for each size.9705Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,9706Intrinsic::ppc_altivec_vsrw9707};9708Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);9709return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);9710}97119712// vsplti + rol self.9713if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |9714((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {9715SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);9716static const unsigned IIDs[] = { // Intrinsic to use for each size.9717Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,9718Intrinsic::ppc_altivec_vrlw9719};9720Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);9721return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);9722}97239724// t = vsplti c, result = vsldoi t, t, 19725if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {9726SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);9727unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;9728return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);9729}9730// t = vsplti c, result = vsldoi t, t, 29731if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {9732SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);9733unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;9734return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);9735}9736// t = vsplti c, result = vsldoi t, t, 39737if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {9738SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);9739unsigned Amt = Subtarget.isLittleEndian() ? 
13 : 3;9740return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);9741}9742}97439744return SDValue();9745}97469747/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit9748/// the specified operations to build the shuffle.9749static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,9750SDValue RHS, SelectionDAG &DAG,9751const SDLoc &dl) {9752unsigned OpNum = (PFEntry >> 26) & 0x0F;9753unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);9754unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);97559756enum {9757OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>9758OP_VMRGHW,9759OP_VMRGLW,9760OP_VSPLTISW0,9761OP_VSPLTISW1,9762OP_VSPLTISW2,9763OP_VSPLTISW3,9764OP_VSLDOI4,9765OP_VSLDOI8,9766OP_VSLDOI129767};97689769if (OpNum == OP_COPY) {9770if (LHSID == (1*9+2)*9+3) return LHS;9771assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");9772return RHS;9773}97749775SDValue OpLHS, OpRHS;9776OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);9777OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);97789779int ShufIdxs[16];9780switch (OpNum) {9781default: llvm_unreachable("Unknown i32 permute!");9782case OP_VMRGHW:9783ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;9784ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;9785ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;9786ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;9787break;9788case OP_VMRGLW:9789ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;9790ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;9791ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;9792ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;9793break;9794case OP_VSPLTISW0:9795for (unsigned i = 0; i != 16; ++i)9796ShufIdxs[i] = (i&3)+0;9797break;9798case OP_VSPLTISW1:9799for (unsigned i = 0; i != 16; ++i)9800ShufIdxs[i] = (i&3)+4;9801break;9802case OP_VSPLTISW2:9803for (unsigned i = 0; i != 16; ++i)9804ShufIdxs[i] = (i&3)+8;9805break;9806case OP_VSPLTISW3:9807for (unsigned i = 0; i != 16; ++i)9808ShufIdxs[i] = (i&3)+12;9809break;9810case OP_VSLDOI4:9811return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);9812case OP_VSLDOI8:9813return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);9814case OP_VSLDOI12:9815return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);9816}9817EVT VT = OpLHS.getValueType();9818OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);9819OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);9820SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);9821return DAG.getNode(ISD::BITCAST, dl, VT, T);9822}98239824/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled9825/// by the VINSERTB instruction introduced in ISA 3.0, else just return default9826/// SDValue.9827SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,9828SelectionDAG &DAG) const {9829const unsigned BytesInVector = 16;9830bool IsLE = Subtarget.isLittleEndian();9831SDLoc dl(N);9832SDValue V1 = N->getOperand(0);9833SDValue V2 = N->getOperand(1);9834unsigned ShiftElts = 0, InsertAtByte = 0;9835bool Swap = false;98369837// Shifts required to get the byte we want at element 7.9838unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,98390, 15, 14, 13, 12, 11, 10, 9};9840unsigned BigEndianShifts[] = {9, 
10, 11, 12, 13, 14, 15, 0,98411, 2, 3, 4, 5, 6, 7, 8};98429843ArrayRef<int> Mask = N->getMask();9844int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};98459846// For each mask element, find out if we're just inserting something9847// from V2 into V1 or vice versa.9848// Possible permutations inserting an element from V2 into V1:9849// X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 159850// 0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 159851// ...9852// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X9853// Inserting from V1 into V2 will be similar, except mask range will be9854// [16,31].98559856bool FoundCandidate = false;9857// If both vector operands for the shuffle are the same vector, the mask9858// will contain only elements from the first one and the second one will be9859// undef.9860unsigned VINSERTBSrcElem = IsLE ? 8 : 7;9861// Go through the mask of half-words to find an element that's being moved9862// from one vector to the other.9863for (unsigned i = 0; i < BytesInVector; ++i) {9864unsigned CurrentElement = Mask[i];9865// If 2nd operand is undefined, we should only look for element 7 in the9866// Mask.9867if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)9868continue;98699870bool OtherElementsInOrder = true;9871// Examine the other elements in the Mask to see if they're in original9872// order.9873for (unsigned j = 0; j < BytesInVector; ++j) {9874if (j == i)9875continue;9876// If CurrentElement is from V1 [0,15], then we the rest of the Mask to be9877// from V2 [16,31] and vice versa. Unless the 2nd operand is undefined,9878// in which we always assume we're always picking from the 1st operand.9879int MaskOffset =9880(!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;9881if (Mask[j] != OriginalOrder[j] + MaskOffset) {9882OtherElementsInOrder = false;9883break;9884}9885}9886// If other elements are in original order, we record the number of shifts9887// we need to get the element we want into element 7. Also record which byte9888// in the vector we should insert into.9889if (OtherElementsInOrder) {9890// If 2nd operand is undefined, we assume no shifts and no swapping.9891if (V2.isUndef()) {9892ShiftElts = 0;9893Swap = false;9894} else {9895// Only need the last 4-bits for shifts because operands will be swapped if CurrentElement is >= 2^4.9896ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]9897: BigEndianShifts[CurrentElement & 0xF];9898Swap = CurrentElement < BytesInVector;9899}9900InsertAtByte = IsLE ? 
BytesInVector - (i + 1) : i;9901FoundCandidate = true;9902break;9903}9904}99059906if (!FoundCandidate)9907return SDValue();99089909// Candidate found, construct the proper SDAG sequence with VINSERTB,9910// optionally with VECSHL if shift is required.9911if (Swap)9912std::swap(V1, V2);9913if (V2.isUndef())9914V2 = V1;9915if (ShiftElts) {9916SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,9917DAG.getConstant(ShiftElts, dl, MVT::i32));9918return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,9919DAG.getConstant(InsertAtByte, dl, MVT::i32));9920}9921return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,9922DAG.getConstant(InsertAtByte, dl, MVT::i32));9923}99249925/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled9926/// by the VINSERTH instruction introduced in ISA 3.0, else just return default9927/// SDValue.9928SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,9929SelectionDAG &DAG) const {9930const unsigned NumHalfWords = 8;9931const unsigned BytesInVector = NumHalfWords * 2;9932// Check that the shuffle is on half-words.9933if (!isNByteElemShuffleMask(N, 2, 1))9934return SDValue();99359936bool IsLE = Subtarget.isLittleEndian();9937SDLoc dl(N);9938SDValue V1 = N->getOperand(0);9939SDValue V2 = N->getOperand(1);9940unsigned ShiftElts = 0, InsertAtByte = 0;9941bool Swap = false;99429943// Shifts required to get the half-word we want at element 3.9944unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};9945unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};99469947uint32_t Mask = 0;9948uint32_t OriginalOrderLow = 0x1234567;9949uint32_t OriginalOrderHigh = 0x89ABCDEF;9950// Now we look at mask elements 0,2,4,6,8,10,12,14. Pack the mask into a9951// 32-bit space, only need 4-bit nibbles per element.9952for (unsigned i = 0; i < NumHalfWords; ++i) {9953unsigned MaskShift = (NumHalfWords - 1 - i) * 4;9954Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);9955}99569957// For each mask element, find out if we're just inserting something9958// from V2 into V1 or vice versa. Possible permutations inserting an element9959// from V2 into V1:9960// X, 1, 2, 3, 4, 5, 6, 79961// 0, X, 2, 3, 4, 5, 6, 79962// 0, 1, X, 3, 4, 5, 6, 79963// 0, 1, 2, X, 4, 5, 6, 79964// 0, 1, 2, 3, X, 5, 6, 79965// 0, 1, 2, 3, 4, X, 6, 79966// 0, 1, 2, 3, 4, 5, X, 79967// 0, 1, 2, 3, 4, 5, 6, X9968// Inserting from V1 into V2 will be similar, except mask range will be [8,15].99699970bool FoundCandidate = false;9971// Go through the mask of half-words to find an element that's being moved9972// from one vector to the other.9973for (unsigned i = 0; i < NumHalfWords; ++i) {9974unsigned MaskShift = (NumHalfWords - 1 - i) * 4;9975uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;9976uint32_t MaskOtherElts = ~(0xF << MaskShift);9977uint32_t TargetOrder = 0x0;99789979// If both vector operands for the shuffle are the same vector, the mask9980// will contain only elements from the first one and the second one will be9981// undef.9982if (V2.isUndef()) {9983ShiftElts = 0;9984unsigned VINSERTHSrcElem = IsLE ? 4 : 3;9985TargetOrder = OriginalOrderLow;9986Swap = false;9987// Skip if not the correct element or mask of other elements don't equal9988// to our expected order.9989if (MaskOneElt == VINSERTHSrcElem &&9990(Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {9991InsertAtByte = IsLE ? 
BytesInVector - (i + 1) * 2 : i * 2;9992FoundCandidate = true;9993break;9994}9995} else { // If both operands are defined.9996// Target order is [8,15] if the current mask is between [0,7].9997TargetOrder =9998(MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;9999// Skip if mask of other elements don't equal our expected order.10000if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {10001// We only need the last 3 bits for the number of shifts.10002ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]10003: BigEndianShifts[MaskOneElt & 0x7];10004InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;10005Swap = MaskOneElt < NumHalfWords;10006FoundCandidate = true;10007break;10008}10009}10010}1001110012if (!FoundCandidate)10013return SDValue();1001410015// Candidate found, construct the proper SDAG sequence with VINSERTH,10016// optionally with VECSHL if shift is required.10017if (Swap)10018std::swap(V1, V2);10019if (V2.isUndef())10020V2 = V1;10021SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);10022if (ShiftElts) {10023// Double ShiftElts because we're left shifting on v16i8 type.10024SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,10025DAG.getConstant(2 * ShiftElts, dl, MVT::i32));10026SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);10027SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,10028DAG.getConstant(InsertAtByte, dl, MVT::i32));10029return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);10030}10031SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);10032SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,10033DAG.getConstant(InsertAtByte, dl, MVT::i32));10034return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);10035}1003610037/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be10038/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise10039/// return the default SDValue.10040SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,10041SelectionDAG &DAG) const {10042// The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles10043// to v16i8. 
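// Worked example (standalone values) of the half-word mask packing used in
// lowerToVINSERTH above: each mask element is divided by 2 to get its
// half-word index and stored in one nibble, most significant nibble first.
// The identity order {0..7} packs to 0x01234567 (OriginalOrderLow); taking
// everything from V2, {8..15}, packs to 0x89ABCDEF (OriginalOrderHigh).
{
  uint32_t Mask = 0;
  unsigned HalfWordIdx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; // getMaskElt(i * 2) / 2
  for (unsigned i = 0; i < 8; ++i)
    Mask |= HalfWordIdx[i] << ((8 - 1 - i) * 4);
  assert(Mask == 0x01234567);
}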
Peek through the bitcasts to get the actual operands.10044SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));10045SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));1004610047auto ShuffleMask = SVN->getMask();10048SDValue VecShuffle(SVN, 0);10049SDLoc DL(SVN);1005010051// Check that we have a four byte shuffle.10052if (!isNByteElemShuffleMask(SVN, 4, 1))10053return SDValue();1005410055// Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.10056if (RHS->getOpcode() != ISD::BUILD_VECTOR) {10057std::swap(LHS, RHS);10058VecShuffle = peekThroughBitcasts(DAG.getCommutedVectorShuffle(*SVN));10059ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(VecShuffle);10060if (!CommutedSV)10061return SDValue();10062ShuffleMask = CommutedSV->getMask();10063}1006410065// Ensure that the RHS is a vector of constants.10066BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());10067if (!BVN)10068return SDValue();1006910070// Check if RHS is a splat of 4-bytes (or smaller).10071APInt APSplatValue, APSplatUndef;10072unsigned SplatBitSize;10073bool HasAnyUndefs;10074if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,10075HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||10076SplatBitSize > 32)10077return SDValue();1007810079// Check that the shuffle mask matches the semantics of XXSPLTI32DX.10080// The instruction splats a constant C into two words of the source vector10081// producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.10082// Thus we check that the shuffle mask is the equivalent of10083// <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.10084// Note: the check above of isNByteElemShuffleMask() ensures that the bytes10085// within each word are consecutive, so we only need to check the first byte.10086SDValue Index;10087bool IsLE = Subtarget.isLittleEndian();10088if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&10089(ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&10090ShuffleMask[4] > 15 && ShuffleMask[12] > 15))10091Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);10092else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&10093(ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&10094ShuffleMask[0] > 15 && ShuffleMask[8] > 15))10095Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);10096else10097return SDValue();1009810099// If the splat is narrower than 32-bits, we need to get the 32-bit value10100// for XXSPLTI32DX.10101unsigned SplatVal = APSplatValue.getZExtValue();10102for (; SplatBitSize < 32; SplatBitSize <<= 1)10103SplatVal |= (SplatVal << SplatBitSize);1010410105SDValue SplatNode = DAG.getNode(10106PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),10107Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));10108return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);10109}1011010111/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).10112/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is10113/// a multiple of 8. 
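// Worked example (standalone values) of the narrow-splat widening performed at
// the end of lowerToXXSPLTI32DX above: an 8-bit splat constant 0xAB is doubled
// up until it fills the 32-bit immediate, 0xAB -> 0xABAB -> 0xABABABAB.
{
  unsigned SplatVal = 0xAB;
  for (unsigned SplatBitSize = 8; SplatBitSize < 32; SplatBitSize <<= 1)
    SplatVal |= (SplatVal << SplatBitSize);
  assert(SplatVal == 0xABABABABu);
}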
Otherwise convert it to a scalar rotation(i128)10114/// i.e (or (shl x, C1), (srl x, 128-C1)).10115SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {10116assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");10117assert(Op.getValueType() == MVT::v1i128 &&10118"Only set v1i128 as custom, other type shouldn't reach here!");10119SDLoc dl(Op);10120SDValue N0 = peekThroughBitcasts(Op.getOperand(0));10121SDValue N1 = peekThroughBitcasts(Op.getOperand(1));10122unsigned SHLAmt = N1.getConstantOperandVal(0);10123if (SHLAmt % 8 == 0) {10124std::array<int, 16> Mask;10125std::iota(Mask.begin(), Mask.end(), 0);10126std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());10127if (SDValue Shuffle =10128DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),10129DAG.getUNDEF(MVT::v16i8), Mask))10130return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);10131}10132SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);10133SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,10134DAG.getConstant(SHLAmt, dl, MVT::i32));10135SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,10136DAG.getConstant(128 - SHLAmt, dl, MVT::i32));10137SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);10138return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);10139}1014010141/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this10142/// is a shuffle we can handle in a single instruction, return it. Otherwise,10143/// return the code it can be lowered into. Worst case, it can always be10144/// lowered into a vperm.10145SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,10146SelectionDAG &DAG) const {10147SDLoc dl(Op);10148SDValue V1 = Op.getOperand(0);10149SDValue V2 = Op.getOperand(1);10150ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);1015110152// Any nodes that were combined in the target-independent combiner prior10153// to vector legalization will not be sent to the target combine. Try to10154// combine it here.10155if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {10156if (!isa<ShuffleVectorSDNode>(NewShuffle))10157return NewShuffle;10158Op = NewShuffle;10159SVOp = cast<ShuffleVectorSDNode>(Op);10160V1 = Op.getOperand(0);10161V2 = Op.getOperand(1);10162}10163EVT VT = Op.getValueType();10164bool isLittleEndian = Subtarget.isLittleEndian();1016510166unsigned ShiftElts, InsertAtByte;10167bool Swap = false;1016810169// If this is a load-and-splat, we can do that with a single instruction10170// in some cases. However if the load has multiple uses, we don't want to10171// combine it because that will just produce multiple loads.10172bool IsPermutedLoad = false;10173const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);10174if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&10175(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&10176InputLoad->hasOneUse()) {10177bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);10178int SplatIdx =10179PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);1018010181// The splat index for permuted loads will be in the left half of the vector10182// which is strictly wider than the loaded value by 8 bytes. So we need to10183// adjust the splat index to point to the correct address in memory.10184if (IsPermutedLoad) {10185assert((isLittleEndian || IsFourByte) &&10186"Unexpected size for permuted load on big endian target");10187SplatIdx += IsFourByte ? 2 : 1;10188assert((SplatIdx < (IsFourByte ? 
4 : 2)) &&10189"Splat of a value outside of the loaded memory");10190}1019110192LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);10193// For 4-byte load-and-splat, we need Power9.10194if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {10195uint64_t Offset = 0;10196if (IsFourByte)10197Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;10198else10199Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;1020010201// If the width of the load is the same as the width of the splat,10202// loading with an offset would load the wrong memory.10203if (LD->getValueType(0).getSizeInBits() == (IsFourByte ? 32 : 64))10204Offset = 0;1020510206SDValue BasePtr = LD->getBasePtr();10207if (Offset != 0)10208BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),10209BasePtr, DAG.getIntPtrConstant(Offset, dl));10210SDValue Ops[] = {10211LD->getChain(), // Chain10212BasePtr, // BasePtr10213DAG.getValueType(Op.getValueType()) // VT10214};10215SDVTList VTL =10216DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);10217SDValue LdSplt =10218DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,10219Ops, LD->getMemoryVT(), LD->getMemOperand());10220DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1), LdSplt.getValue(1));10221if (LdSplt.getValueType() != SVOp->getValueType(0))10222LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);10223return LdSplt;10224}10225}1022610227// All v2i64 and v2f64 shuffles are legal10228if (VT == MVT::v2i64 || VT == MVT::v2f64)10229return Op;1023010231if (Subtarget.hasP9Vector() &&10232PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,10233isLittleEndian)) {10234if (V2.isUndef())10235V2 = V1;10236else if (Swap)10237std::swap(V1, V2);10238SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);10239SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);10240if (ShiftElts) {10241SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,10242DAG.getConstant(ShiftElts, dl, MVT::i32));10243SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,10244DAG.getConstant(InsertAtByte, dl, MVT::i32));10245return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);10246}10247SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,10248DAG.getConstant(InsertAtByte, dl, MVT::i32));10249return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);10250}1025110252if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {10253SDValue SplatInsertNode;10254if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))10255return SplatInsertNode;10256}1025710258if (Subtarget.hasP9Altivec()) {10259SDValue NewISDNode;10260if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))10261return NewISDNode;1026210263if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))10264return NewISDNode;10265}1026610267if (Subtarget.hasVSX() &&10268PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {10269if (Swap)10270std::swap(V1, V2);10271SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);10272SDValue Conv2 =10273DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? 
V1 : V2);1027410275SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,10276DAG.getConstant(ShiftElts, dl, MVT::i32));10277return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);10278}1027910280if (Subtarget.hasVSX() &&10281PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {10282if (Swap)10283std::swap(V1, V2);10284SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);10285SDValue Conv2 =10286DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);1028710288SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,10289DAG.getConstant(ShiftElts, dl, MVT::i32));10290return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);10291}1029210293if (Subtarget.hasP9Vector()) {10294if (PPC::isXXBRHShuffleMask(SVOp)) {10295SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);10296SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);10297return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);10298} else if (PPC::isXXBRWShuffleMask(SVOp)) {10299SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);10300SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);10301return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);10302} else if (PPC::isXXBRDShuffleMask(SVOp)) {10303SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);10304SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);10305return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);10306} else if (PPC::isXXBRQShuffleMask(SVOp)) {10307SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);10308SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);10309return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);10310}10311}1031210313if (Subtarget.hasVSX()) {10314if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {10315int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);1031610317SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);10318SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,10319DAG.getConstant(SplatIdx, dl, MVT::i32));10320return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);10321}1032210323// Left shifts of 8 bytes are actually swaps. 
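// Worked example (standalone values): a vsldoi of a vector with itself by 8
// bytes selects bytes {8..15, 0..7}, i.e. it exchanges the two doublewords,
// which is why this case is lowered to PPCISD::SWAP_NO_CHAIN just below.
{
  int MaskBytes[16];
  for (unsigned i = 0; i != 16; ++i)
    MaskBytes[i] = (i + 8) % 16; // {8,9,...,15,0,1,...,7}
  (void)MaskBytes;
}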
Convert accordingly.10324if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {10325SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);10326SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);10327return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);10328}10329}1033010331// Cases that are handled by instructions that take permute immediates10332// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be10333// selected by the instruction selector.10334if (V2.isUndef()) {10335if (PPC::isSplatShuffleMask(SVOp, 1) ||10336PPC::isSplatShuffleMask(SVOp, 2) ||10337PPC::isSplatShuffleMask(SVOp, 4) ||10338PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||10339PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||10340PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||10341PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||10342PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||10343PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||10344PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||10345PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||10346PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||10347(Subtarget.hasP8Altivec() && (10348PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||10349PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||10350PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {10351return Op;10352}10353}1035410355// Altivec has a variety of "shuffle immediates" that take two vector inputs10356// and produce a fixed permutation. If any of these match, do not lower to10357// VPERM.10358unsigned int ShuffleKind = isLittleEndian ? 2 : 0;10359if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||10360PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||10361PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||10362PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||10363PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||10364PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||10365PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||10366PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||10367PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||10368(Subtarget.hasP8Altivec() && (10369PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||10370PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||10371PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))10372return Op;1037310374// Check to see if this is a shuffle of 4-byte values. 
If so, we can use our10375// perfect shuffle table to emit an optimal matching sequence.10376ArrayRef<int> PermMask = SVOp->getMask();1037710378if (!DisablePerfectShuffle && !isLittleEndian) {10379unsigned PFIndexes[4];10380bool isFourElementShuffle = true;10381for (unsigned i = 0; i != 4 && isFourElementShuffle;10382++i) { // Element number10383unsigned EltNo = 8; // Start out undef.10384for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.10385if (PermMask[i * 4 + j] < 0)10386continue; // Undef, ignore it.1038710388unsigned ByteSource = PermMask[i * 4 + j];10389if ((ByteSource & 3) != j) {10390isFourElementShuffle = false;10391break;10392}1039310394if (EltNo == 8) {10395EltNo = ByteSource / 4;10396} else if (EltNo != ByteSource / 4) {10397isFourElementShuffle = false;10398break;10399}10400}10401PFIndexes[i] = EltNo;10402}1040310404// If this shuffle can be expressed as a shuffle of 4-byte elements, use the10405// perfect shuffle vector to determine if it is cost effective to do this as10406// discrete instructions, or whether we should use a vperm.10407// For now, we skip this for little endian until such time as we have a10408// little-endian perfect shuffle table.10409if (isFourElementShuffle) {10410// Compute the index in the perfect shuffle table.10411unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +10412PFIndexes[2] * 9 + PFIndexes[3];1041310414unsigned PFEntry = PerfectShuffleTable[PFTableIndex];10415unsigned Cost = (PFEntry >> 30);1041610417// Determining when to avoid vperm is tricky. Many things affect the cost10418// of vperm, particularly how many times the perm mask needs to be10419// computed. For example, if the perm mask can be hoisted out of a loop or10420// is already used (perhaps because there are multiple permutes with the10421// same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the10422// permute mask out of the loop requires an extra register.10423//10424// As a compromise, we only emit discrete instructions if the shuffle can10425// be generated in 3 or fewer operations. 
When we have loop information10426// available, if this block is within a loop, we should avoid using vperm10427// for 3-operation perms and use a constant pool load instead.10428if (Cost < 3)10429return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);10430}10431}1043210433// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant10434// vector that will get spilled to the constant pool.10435if (V2.isUndef()) V2 = V1;1043610437return LowerVPERM(Op, DAG, PermMask, VT, V1, V2);10438}1043910440SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,10441ArrayRef<int> PermMask, EVT VT,10442SDValue V1, SDValue V2) const {10443unsigned Opcode = PPCISD::VPERM;10444EVT ValType = V1.getValueType();10445SDLoc dl(Op);10446bool NeedSwap = false;10447bool isLittleEndian = Subtarget.isLittleEndian();10448bool isPPC64 = Subtarget.isPPC64();1044910450if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&10451(V1->hasOneUse() || V2->hasOneUse())) {10452LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "10453"XXPERM instead\n");10454Opcode = PPCISD::XXPERM;1045510456// The second input to XXPERM is also an output so if the second input has10457// multiple uses then copying is necessary, as a result we want the10458// single-use operand to be used as the second input to prevent copying.10459if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||10460(isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {10461std::swap(V1, V2);10462NeedSwap = !NeedSwap;10463}10464}1046510466// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except10467// that it is in input element units, not in bytes. Convert now.1046810469// For little endian, the order of the input vectors is reversed, and10470// the permutation mask is complemented with respect to 31. This is10471// necessary to produce proper semantics with the big-endian-based vperm10472// instruction.10473EVT EltVT = V1.getValueType().getVectorElementType();10474unsigned BytesPerElement = EltVT.getSizeInBits() / 8;1047510476bool V1HasXXSWAPD = V1->getOperand(0)->getOpcode() == PPCISD::XXSWAPD;10477bool V2HasXXSWAPD = V2->getOperand(0)->getOpcode() == PPCISD::XXSWAPD;1047810479/*10480Vectors will be appended like so: [ V1 | v2 ]10481XXSWAPD on V1:10482[ A | B | C | D ] -> [ C | D | A | B ]104830-3 4-7 8-11 12-15 0-3 4-7 8-11 12-1510484i.e. index of A, B += 8, and index of C, D -= 8.10485XXSWAPD on V2:10486[ E | F | G | H ] -> [ G | H | E | F ]1048716-19 20-23 24-27 28-31 16-19 20-23 24-27 28-3110488i.e. index of E, F += 8, index of G, H -= 810489Swap V1 and V2:10490[ V1 | V2 ] -> [ V2 | V1 ]104910-15 16-31 0-15 16-3110492i.e. index of V1 += 16, index of V2 -= 1610493*/1049410495SmallVector<SDValue, 16> ResultMask;10496for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {10497unsigned SrcElt = PermMask[i] < 0 ? 
0 : PermMask[i];1049810499if (V1HasXXSWAPD) {10500if (SrcElt < 8)10501SrcElt += 8;10502else if (SrcElt < 16)10503SrcElt -= 8;10504}10505if (V2HasXXSWAPD) {10506if (SrcElt > 23)10507SrcElt -= 8;10508else if (SrcElt > 15)10509SrcElt += 8;10510}10511if (NeedSwap) {10512if (SrcElt < 16)10513SrcElt += 16;10514else10515SrcElt -= 16;10516}10517for (unsigned j = 0; j != BytesPerElement; ++j)10518if (isLittleEndian)10519ResultMask.push_back(10520DAG.getConstant(31 - (SrcElt * BytesPerElement + j), dl, MVT::i32));10521else10522ResultMask.push_back(10523DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32));10524}1052510526if (V1HasXXSWAPD) {10527dl = SDLoc(V1->getOperand(0));10528V1 = V1->getOperand(0)->getOperand(1);10529}10530if (V2HasXXSWAPD) {10531dl = SDLoc(V2->getOperand(0));10532V2 = V2->getOperand(0)->getOperand(1);10533}1053410535if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {10536if (ValType != MVT::v2f64)10537V1 = DAG.getBitcast(MVT::v2f64, V1);10538if (V2.getValueType() != MVT::v2f64)10539V2 = DAG.getBitcast(MVT::v2f64, V2);10540}1054110542ShufflesHandledWithVPERM++;10543SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);10544LLVM_DEBUG({10545ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);10546if (Opcode == PPCISD::XXPERM) {10547dbgs() << "Emitting a XXPERM for the following shuffle:\n";10548} else {10549dbgs() << "Emitting a VPERM for the following shuffle:\n";10550}10551SVOp->dump();10552dbgs() << "With the following permute control vector:\n";10553VPermMask.dump();10554});1055510556if (Opcode == PPCISD::XXPERM)10557VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask);1055810559// Only need to place items backwards in LE,10560// the mask was properly calculated.10561if (isLittleEndian)10562std::swap(V1, V2);1056310564SDValue VPERMNode =10565DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask);1056610567VPERMNode = DAG.getBitcast(ValType, VPERMNode);10568return VPERMNode;10569}1057010571/// getVectorCompareInfo - Given an intrinsic, return false if it is not a10572/// vector comparison. 
If it is, return true and fill in Opc/isDot with10573/// information about the intrinsic.10574static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,10575bool &isDot, const PPCSubtarget &Subtarget) {10576unsigned IntrinsicID = Intrin.getConstantOperandVal(0);10577CompareOpc = -1;10578isDot = false;10579switch (IntrinsicID) {10580default:10581return false;10582// Comparison predicates.10583case Intrinsic::ppc_altivec_vcmpbfp_p:10584CompareOpc = 966;10585isDot = true;10586break;10587case Intrinsic::ppc_altivec_vcmpeqfp_p:10588CompareOpc = 198;10589isDot = true;10590break;10591case Intrinsic::ppc_altivec_vcmpequb_p:10592CompareOpc = 6;10593isDot = true;10594break;10595case Intrinsic::ppc_altivec_vcmpequh_p:10596CompareOpc = 70;10597isDot = true;10598break;10599case Intrinsic::ppc_altivec_vcmpequw_p:10600CompareOpc = 134;10601isDot = true;10602break;10603case Intrinsic::ppc_altivec_vcmpequd_p:10604if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {10605CompareOpc = 199;10606isDot = true;10607} else10608return false;10609break;10610case Intrinsic::ppc_altivec_vcmpneb_p:10611case Intrinsic::ppc_altivec_vcmpneh_p:10612case Intrinsic::ppc_altivec_vcmpnew_p:10613case Intrinsic::ppc_altivec_vcmpnezb_p:10614case Intrinsic::ppc_altivec_vcmpnezh_p:10615case Intrinsic::ppc_altivec_vcmpnezw_p:10616if (Subtarget.hasP9Altivec()) {10617switch (IntrinsicID) {10618default:10619llvm_unreachable("Unknown comparison intrinsic.");10620case Intrinsic::ppc_altivec_vcmpneb_p:10621CompareOpc = 7;10622break;10623case Intrinsic::ppc_altivec_vcmpneh_p:10624CompareOpc = 71;10625break;10626case Intrinsic::ppc_altivec_vcmpnew_p:10627CompareOpc = 135;10628break;10629case Intrinsic::ppc_altivec_vcmpnezb_p:10630CompareOpc = 263;10631break;10632case Intrinsic::ppc_altivec_vcmpnezh_p:10633CompareOpc = 327;10634break;10635case Intrinsic::ppc_altivec_vcmpnezw_p:10636CompareOpc = 391;10637break;10638}10639isDot = true;10640} else10641return false;10642break;10643case Intrinsic::ppc_altivec_vcmpgefp_p:10644CompareOpc = 454;10645isDot = true;10646break;10647case Intrinsic::ppc_altivec_vcmpgtfp_p:10648CompareOpc = 710;10649isDot = true;10650break;10651case Intrinsic::ppc_altivec_vcmpgtsb_p:10652CompareOpc = 774;10653isDot = true;10654break;10655case Intrinsic::ppc_altivec_vcmpgtsh_p:10656CompareOpc = 838;10657isDot = true;10658break;10659case Intrinsic::ppc_altivec_vcmpgtsw_p:10660CompareOpc = 902;10661isDot = true;10662break;10663case Intrinsic::ppc_altivec_vcmpgtsd_p:10664if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {10665CompareOpc = 967;10666isDot = true;10667} else10668return false;10669break;10670case Intrinsic::ppc_altivec_vcmpgtub_p:10671CompareOpc = 518;10672isDot = true;10673break;10674case Intrinsic::ppc_altivec_vcmpgtuh_p:10675CompareOpc = 582;10676isDot = true;10677break;10678case Intrinsic::ppc_altivec_vcmpgtuw_p:10679CompareOpc = 646;10680isDot = true;10681break;10682case Intrinsic::ppc_altivec_vcmpgtud_p:10683if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {10684CompareOpc = 711;10685isDot = true;10686} else10687return false;10688break;1068910690case Intrinsic::ppc_altivec_vcmpequq:10691case Intrinsic::ppc_altivec_vcmpgtsq:10692case Intrinsic::ppc_altivec_vcmpgtuq:10693if (!Subtarget.isISA3_1())10694return false;10695switch (IntrinsicID) {10696default:10697llvm_unreachable("Unknown comparison intrinsic.");10698case Intrinsic::ppc_altivec_vcmpequq:10699CompareOpc = 455;10700break;10701case Intrinsic::ppc_altivec_vcmpgtsq:10702CompareOpc = 903;10703break;10704case 
Intrinsic::ppc_altivec_vcmpgtuq:10705CompareOpc = 647;10706break;10707}10708break;1070910710// VSX predicate comparisons use the same infrastructure10711case Intrinsic::ppc_vsx_xvcmpeqdp_p:10712case Intrinsic::ppc_vsx_xvcmpgedp_p:10713case Intrinsic::ppc_vsx_xvcmpgtdp_p:10714case Intrinsic::ppc_vsx_xvcmpeqsp_p:10715case Intrinsic::ppc_vsx_xvcmpgesp_p:10716case Intrinsic::ppc_vsx_xvcmpgtsp_p:10717if (Subtarget.hasVSX()) {10718switch (IntrinsicID) {10719case Intrinsic::ppc_vsx_xvcmpeqdp_p:10720CompareOpc = 99;10721break;10722case Intrinsic::ppc_vsx_xvcmpgedp_p:10723CompareOpc = 115;10724break;10725case Intrinsic::ppc_vsx_xvcmpgtdp_p:10726CompareOpc = 107;10727break;10728case Intrinsic::ppc_vsx_xvcmpeqsp_p:10729CompareOpc = 67;10730break;10731case Intrinsic::ppc_vsx_xvcmpgesp_p:10732CompareOpc = 83;10733break;10734case Intrinsic::ppc_vsx_xvcmpgtsp_p:10735CompareOpc = 75;10736break;10737}10738isDot = true;10739} else10740return false;10741break;1074210743// Normal Comparisons.10744case Intrinsic::ppc_altivec_vcmpbfp:10745CompareOpc = 966;10746break;10747case Intrinsic::ppc_altivec_vcmpeqfp:10748CompareOpc = 198;10749break;10750case Intrinsic::ppc_altivec_vcmpequb:10751CompareOpc = 6;10752break;10753case Intrinsic::ppc_altivec_vcmpequh:10754CompareOpc = 70;10755break;10756case Intrinsic::ppc_altivec_vcmpequw:10757CompareOpc = 134;10758break;10759case Intrinsic::ppc_altivec_vcmpequd:10760if (Subtarget.hasP8Altivec())10761CompareOpc = 199;10762else10763return false;10764break;10765case Intrinsic::ppc_altivec_vcmpneb:10766case Intrinsic::ppc_altivec_vcmpneh:10767case Intrinsic::ppc_altivec_vcmpnew:10768case Intrinsic::ppc_altivec_vcmpnezb:10769case Intrinsic::ppc_altivec_vcmpnezh:10770case Intrinsic::ppc_altivec_vcmpnezw:10771if (Subtarget.hasP9Altivec())10772switch (IntrinsicID) {10773default:10774llvm_unreachable("Unknown comparison intrinsic.");10775case Intrinsic::ppc_altivec_vcmpneb:10776CompareOpc = 7;10777break;10778case Intrinsic::ppc_altivec_vcmpneh:10779CompareOpc = 71;10780break;10781case Intrinsic::ppc_altivec_vcmpnew:10782CompareOpc = 135;10783break;10784case Intrinsic::ppc_altivec_vcmpnezb:10785CompareOpc = 263;10786break;10787case Intrinsic::ppc_altivec_vcmpnezh:10788CompareOpc = 327;10789break;10790case Intrinsic::ppc_altivec_vcmpnezw:10791CompareOpc = 391;10792break;10793}10794else10795return false;10796break;10797case Intrinsic::ppc_altivec_vcmpgefp:10798CompareOpc = 454;10799break;10800case Intrinsic::ppc_altivec_vcmpgtfp:10801CompareOpc = 710;10802break;10803case Intrinsic::ppc_altivec_vcmpgtsb:10804CompareOpc = 774;10805break;10806case Intrinsic::ppc_altivec_vcmpgtsh:10807CompareOpc = 838;10808break;10809case Intrinsic::ppc_altivec_vcmpgtsw:10810CompareOpc = 902;10811break;10812case Intrinsic::ppc_altivec_vcmpgtsd:10813if (Subtarget.hasP8Altivec())10814CompareOpc = 967;10815else10816return false;10817break;10818case Intrinsic::ppc_altivec_vcmpgtub:10819CompareOpc = 518;10820break;10821case Intrinsic::ppc_altivec_vcmpgtuh:10822CompareOpc = 582;10823break;10824case Intrinsic::ppc_altivec_vcmpgtuw:10825CompareOpc = 646;10826break;10827case Intrinsic::ppc_altivec_vcmpgtud:10828if (Subtarget.hasP8Altivec())10829CompareOpc = 711;10830else10831return false;10832break;10833case Intrinsic::ppc_altivec_vcmpequq_p:10834case Intrinsic::ppc_altivec_vcmpgtsq_p:10835case Intrinsic::ppc_altivec_vcmpgtuq_p:10836if (!Subtarget.isISA3_1())10837return false;10838switch (IntrinsicID) {10839default:10840llvm_unreachable("Unknown comparison intrinsic.");10841case 
Intrinsic::ppc_altivec_vcmpequq_p:10842CompareOpc = 455;10843break;10844case Intrinsic::ppc_altivec_vcmpgtsq_p:10845CompareOpc = 903;10846break;10847case Intrinsic::ppc_altivec_vcmpgtuq_p:10848CompareOpc = 647;10849break;10850}10851isDot = true;10852break;10853}10854return true;10855}1085610857/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom10858/// lower, do it, otherwise return null.10859SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,10860SelectionDAG &DAG) const {10861unsigned IntrinsicID = Op.getConstantOperandVal(0);1086210863SDLoc dl(Op);1086410865switch (IntrinsicID) {10866case Intrinsic::thread_pointer:10867// Reads the thread pointer register, used for __builtin_thread_pointer.10868if (Subtarget.isPPC64())10869return DAG.getRegister(PPC::X13, MVT::i64);10870return DAG.getRegister(PPC::R2, MVT::i32);1087110872case Intrinsic::ppc_rldimi: {10873assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");10874SDValue Src = Op.getOperand(1);10875APInt Mask = Op.getConstantOperandAPInt(4);10876if (Mask.isZero())10877return Op.getOperand(2);10878if (Mask.isAllOnes())10879return DAG.getNode(ISD::ROTL, dl, MVT::i64, Src, Op.getOperand(3));10880uint64_t SH = Op.getConstantOperandVal(3);10881unsigned MB = 0, ME = 0;10882if (!isRunOfOnes64(Mask.getZExtValue(), MB, ME))10883report_fatal_error("invalid rldimi mask!");10884// rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.10885if (ME < 63 - SH) {10886Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,10887DAG.getConstant(ME + SH + 1, dl, MVT::i32));10888} else if (ME > 63 - SH) {10889Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,10890DAG.getConstant(ME + SH - 63, dl, MVT::i32));10891}10892return SDValue(10893DAG.getMachineNode(PPC::RLDIMI, dl, MVT::i64,10894{Op.getOperand(2), Src,10895DAG.getTargetConstant(63 - ME, dl, MVT::i32),10896DAG.getTargetConstant(MB, dl, MVT::i32)}),108970);10898}1089910900case Intrinsic::ppc_rlwimi: {10901APInt Mask = Op.getConstantOperandAPInt(4);10902if (Mask.isZero())10903return Op.getOperand(2);10904if (Mask.isAllOnes())10905return DAG.getNode(ISD::ROTL, dl, MVT::i32, Op.getOperand(1),10906Op.getOperand(3));10907unsigned MB = 0, ME = 0;10908if (!isRunOfOnes(Mask.getZExtValue(), MB, ME))10909report_fatal_error("invalid rlwimi mask!");10910return SDValue(DAG.getMachineNode(10911PPC::RLWIMI, dl, MVT::i32,10912{Op.getOperand(2), Op.getOperand(1), Op.getOperand(3),10913DAG.getTargetConstant(MB, dl, MVT::i32),10914DAG.getTargetConstant(ME, dl, MVT::i32)}),109150);10916}1091710918case Intrinsic::ppc_rlwnm: {10919if (Op.getConstantOperandVal(3) == 0)10920return DAG.getConstant(0, dl, MVT::i32);10921unsigned MB = 0, ME = 0;10922if (!isRunOfOnes(Op.getConstantOperandVal(3), MB, ME))10923report_fatal_error("invalid rlwnm mask!");10924return SDValue(10925DAG.getMachineNode(PPC::RLWNM, dl, MVT::i32,10926{Op.getOperand(1), Op.getOperand(2),10927DAG.getTargetConstant(MB, dl, MVT::i32),10928DAG.getTargetConstant(ME, dl, MVT::i32)}),109290);10930}1093110932case Intrinsic::ppc_mma_disassemble_acc: {10933if (Subtarget.isISAFuture()) {10934EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};10935SDValue WideVec =10936SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, ReturnTypes,10937Op.getOperand(1)),109380);10939SmallVector<SDValue, 4> RetOps;10940SDValue Value = SDValue(WideVec.getNode(), 0);10941SDValue Value2 = SDValue(WideVec.getNode(), 1);1094210943SDValue Extract;10944Extract = DAG.getNode(10945PPCISD::EXTRACT_VSX_REG, dl, 
MVT::v16i8,10946Subtarget.isLittleEndian() ? Value2 : Value,10947DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0,10948dl, getPointerTy(DAG.getDataLayout())));10949RetOps.push_back(Extract);10950Extract = DAG.getNode(10951PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,10952Subtarget.isLittleEndian() ? Value2 : Value,10953DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1,10954dl, getPointerTy(DAG.getDataLayout())));10955RetOps.push_back(Extract);10956Extract = DAG.getNode(10957PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,10958Subtarget.isLittleEndian() ? Value : Value2,10959DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0,10960dl, getPointerTy(DAG.getDataLayout())));10961RetOps.push_back(Extract);10962Extract = DAG.getNode(10963PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,10964Subtarget.isLittleEndian() ? Value : Value2,10965DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1,10966dl, getPointerTy(DAG.getDataLayout())));10967RetOps.push_back(Extract);10968return DAG.getMergeValues(RetOps, dl);10969}10970[[fallthrough]];10971}10972case Intrinsic::ppc_vsx_disassemble_pair: {10973int NumVecs = 2;10974SDValue WideVec = Op.getOperand(1);10975if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {10976NumVecs = 4;10977WideVec = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, WideVec);10978}10979SmallVector<SDValue, 4> RetOps;10980for (int VecNo = 0; VecNo < NumVecs; VecNo++) {10981SDValue Extract = DAG.getNode(10982PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec,10983DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo10984: VecNo,10985dl, getPointerTy(DAG.getDataLayout())));10986RetOps.push_back(Extract);10987}10988return DAG.getMergeValues(RetOps, dl);10989}1099010991case Intrinsic::ppc_mma_xxmfacc:10992case Intrinsic::ppc_mma_xxmtacc: {10993// Allow pre-isa-future subtargets to lower as normal.10994if (!Subtarget.isISAFuture())10995return SDValue();10996// The intrinsics for xxmtacc and xxmfacc take one argument of10997// type v512i1, for future cpu the corresponding wacc instruction10998// dmxx[inst|extf]dmr512 is always generated for type v512i1, negating10999// the need to produce the xxm[t|f]acc.11000SDValue WideVec = Op.getOperand(1);11001DAG.ReplaceAllUsesWith(Op, WideVec);11002return SDValue();11003}1100411005case Intrinsic::ppc_unpack_longdouble: {11006auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));11007assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&11008"Argument of long double unpack must be 0 or 1!");11009return DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(1),11010DAG.getConstant(!!(Idx->getSExtValue()), dl,11011Idx->getValueType(0)));11012}1101311014case Intrinsic::ppc_compare_exp_lt:11015case Intrinsic::ppc_compare_exp_gt:11016case Intrinsic::ppc_compare_exp_eq:11017case Intrinsic::ppc_compare_exp_uo: {11018unsigned Pred;11019switch (IntrinsicID) {11020case Intrinsic::ppc_compare_exp_lt:11021Pred = PPC::PRED_LT;11022break;11023case Intrinsic::ppc_compare_exp_gt:11024Pred = PPC::PRED_GT;11025break;11026case Intrinsic::ppc_compare_exp_eq:11027Pred = PPC::PRED_EQ;11028break;11029case Intrinsic::ppc_compare_exp_uo:11030Pred = PPC::PRED_UN;11031break;11032}11033return SDValue(11034DAG.getMachineNode(11035PPC::SELECT_CC_I4, dl, MVT::i32,11036{SDValue(DAG.getMachineNode(PPC::XSCMPEXPDP, dl, MVT::i32,11037Op.getOperand(1), Op.getOperand(2)),110380),11039DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),11040DAG.getTargetConstant(Pred, dl, MVT::i32)}),110410);11042}11043case Intrinsic::ppc_test_data_class: {11044EVT OpVT = 
Op.getOperand(1).getValueType();11045unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP11046: (OpVT == MVT::f64 ? PPC::XSTSTDCDP11047: PPC::XSTSTDCSP);11048return SDValue(11049DAG.getMachineNode(11050PPC::SELECT_CC_I4, dl, MVT::i32,11051{SDValue(DAG.getMachineNode(CmprOpc, dl, MVT::i32, Op.getOperand(2),11052Op.getOperand(1)),110530),11054DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),11055DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}),110560);11057}11058case Intrinsic::ppc_fnmsub: {11059EVT VT = Op.getOperand(1).getValueType();11060if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))11061return DAG.getNode(11062ISD::FNEG, dl, VT,11063DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2),11064DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3))));11065return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1),11066Op.getOperand(2), Op.getOperand(3));11067}11068case Intrinsic::ppc_convert_f128_to_ppcf128:11069case Intrinsic::ppc_convert_ppcf128_to_f128: {11070RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f12811071? RTLIB::CONVERT_PPCF128_F12811072: RTLIB::CONVERT_F128_PPCF128;11073MakeLibCallOptions CallOptions;11074std::pair<SDValue, SDValue> Result =11075makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(1), CallOptions,11076dl, SDValue());11077return Result.first;11078}11079case Intrinsic::ppc_maxfe:11080case Intrinsic::ppc_maxfl:11081case Intrinsic::ppc_maxfs:11082case Intrinsic::ppc_minfe:11083case Intrinsic::ppc_minfl:11084case Intrinsic::ppc_minfs: {11085EVT VT = Op.getValueType();11086assert(11087all_of(Op->ops().drop_front(4),11088[VT](const SDUse &Use) { return Use.getValueType() == VT; }) &&11089"ppc_[max|min]f[e|l|s] must have uniform type arguments");11090(void)VT;11091ISD::CondCode CC = ISD::SETGT;11092if (IntrinsicID == Intrinsic::ppc_minfe ||11093IntrinsicID == Intrinsic::ppc_minfl ||11094IntrinsicID == Intrinsic::ppc_minfs)11095CC = ISD::SETLT;11096unsigned I = Op.getNumOperands() - 2, Cnt = I;11097SDValue Res = Op.getOperand(I);11098for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? 
(Op.getNumOperands() - 1) : I)) {11099Res =11100DAG.getSelectCC(dl, Res, Op.getOperand(I), Res, Op.getOperand(I), CC);11101}11102return Res;11103}11104}1110511106// If this is a lowered altivec predicate compare, CompareOpc is set to the11107// opcode number of the comparison.11108int CompareOpc;11109bool isDot;11110if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))11111return SDValue(); // Don't custom lower most intrinsics.1111211113// If this is a non-dot comparison, make the VCMP node and we are done.11114if (!isDot) {11115SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),11116Op.getOperand(1), Op.getOperand(2),11117DAG.getConstant(CompareOpc, dl, MVT::i32));11118return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);11119}1112011121// Create the PPCISD altivec 'dot' comparison node.11122SDValue Ops[] = {11123Op.getOperand(2), // LHS11124Op.getOperand(3), // RHS11125DAG.getConstant(CompareOpc, dl, MVT::i32)11126};11127EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };11128SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);1112911130// Now that we have the comparison, emit a copy from the CR to a GPR.11131// This is flagged to the above dot comparison.11132SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,11133DAG.getRegister(PPC::CR6, MVT::i32),11134CompNode.getValue(1));1113511136// Unpack the result based on how the target uses it.11137unsigned BitNo; // Bit # of CR6.11138bool InvertBit; // Invert result?11139switch (Op.getConstantOperandVal(1)) {11140default: // Can't happen, don't crash on invalid number though.11141case 0: // Return the value of the EQ bit of CR6.11142BitNo = 0; InvertBit = false;11143break;11144case 1: // Return the inverted value of the EQ bit of CR6.11145BitNo = 0; InvertBit = true;11146break;11147case 2: // Return the value of the LT bit of CR6.11148BitNo = 2; InvertBit = false;11149break;11150case 3: // Return the inverted value of the LT bit of CR6.11151BitNo = 2; InvertBit = true;11152break;11153}1115411155// Shift the bit into the low position.11156Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,11157DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));11158// Isolate the bit.11159Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,11160DAG.getConstant(1, dl, MVT::i32));1116111162// If we are supposed to, toggle the bit.11163if (InvertBit)11164Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,11165DAG.getConstant(1, dl, MVT::i32));11166return Flags;11167}1116811169SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,11170SelectionDAG &DAG) const {11171// SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to11172// the beginning of the argument list.11173int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;11174SDLoc DL(Op);11175switch (Op.getConstantOperandVal(ArgStart)) {11176case Intrinsic::ppc_cfence: {11177assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");11178SDValue Val = Op.getOperand(ArgStart + 1);11179EVT Ty = Val.getValueType();11180if (Ty == MVT::i128) {11181// FIXME: Testing one of two paired registers is sufficient to guarantee11182// ordering?11183Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val);11184}11185unsigned Opcode = Subtarget.isPPC64() ? PPC::CFENCE8 : PPC::CFENCE;11186EVT FTy = Subtarget.isPPC64() ? 
      MVT::i64 : MVT::i32;
    return SDValue(
        DAG.getMachineNode(Opcode, DL, MVT::Other,
                           DAG.getNode(ISD::ANY_EXTEND, DL, FTy, Val),
                           Op.getOperand(0)),
        0);
  }
  default:
    break;
  }
  return SDValue();
}

// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (!Subtarget.isPPC64())
    return Op;
  // MTVSRDD
  Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
                   Op.getOperand(0));
  // XXBRD
  Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
  // MFVSRD
  int VectorIndex = 0;
  if (Subtarget.isLittleEndian())
    VectorIndex = 1;
  Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
                   DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
  return Op;
}

// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
         "Expecting an atomic compare-and-swap here.");
  SDLoc dl(Op);
  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = AtomicNode->getMemoryVT();
  if (MemVT.getSizeInBits() >= 32)
    return Op;

  SDValue CmpOp = Op.getOperand(2);
  // If this is already correctly zero-extended, leave it alone.
  auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
  if (DAG.MaskedValueIsZero(CmpOp, HighBits))
    return Op;

  // Clear the high bits of the compare operand.
  unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
  SDValue NewCmpOp =
      DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
                  DAG.getConstant(MaskVal, dl, MVT::i32));

  // Replace the existing compare operand with the properly zero-extended one.
  SmallVector<SDValue, 4> Ops;
  for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
    Ops.push_back(AtomicNode->getOperand(i));
  Ops[2] = NewCmpOp;
  MachineMemOperand *MMO = AtomicNode->getMemOperand();
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
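  // Illustrative example of the narrowing above (not part of the original
  // comments): for an i8 cmpxchg, MaskVal is (1 << 8) - 1 = 0xFF, so a
  // sign-extended expected value such as 0xFFFFFF80 (-128) becomes 0x80,
  // matching the zero-extended value the partword atomic load produces for
  // the comparison.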
  auto NodeTy =
      (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
  return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}

SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  AtomicSDNode *N = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = N->getMemoryVT();
  assert(MemVT.getSimpleVT() == MVT::i128 &&
         "Expect quadword atomic operations");
  SDLoc dl(N);
  unsigned Opc = N->getOpcode();
  switch (Opc) {
  case ISD::ATOMIC_LOAD: {
    // Lower quadword atomic load to int_ppc_atomic_load_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
    SmallVector<SDValue, 4> Ops{
        N->getOperand(0),
        DAG.getConstant(Intrinsic::ppc_atomic_load_i128, dl, MVT::i32)};
    for (int I = 1, E = N->getNumOperands(); I < E; ++I)
      Ops.push_back(N->getOperand(I));
    SDValue LoadedVal = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys,
                                                Ops, MemVT, N->getMemOperand());
    SDValue ValLo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal);
    SDValue ValHi =
        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal.getValue(1));
    ValHi = DAG.getNode(ISD::SHL, dl, MVT::i128, ValHi,
                        DAG.getConstant(64, dl, MVT::i32));
    SDValue Val =
        DAG.getNode(ISD::OR, dl, {MVT::i128, MVT::Other}, {ValLo, ValHi});
    return DAG.getNode(ISD::MERGE_VALUES, dl, {MVT::i128, MVT::Other},
                       {Val, LoadedVal.getValue(2)});
  }
  case ISD::ATOMIC_STORE: {
    // Lower quadword atomic store to int_ppc_atomic_store_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList Tys = DAG.getVTList(MVT::Other);
    SmallVector<SDValue, 4> Ops{
        N->getOperand(0),
        DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)};
    SDValue Val = N->getOperand(1);
    SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val);
    SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val,
                                DAG.getConstant(64, dl, MVT::i32));
    ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi);
    Ops.push_back(ValLo);
    Ops.push_back(ValHi);
    Ops.push_back(N->getOperand(2));
    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT,
                                   N->getMemOperand());
  }
  default:
    llvm_unreachable("Unexpected atomic opcode");
  }
}

static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl,
                                SelectionDAG &DAG,
                                const PPCSubtarget &Subtarget) {
  assert(Mask <= fcAllFlags && "Invalid fp_class flags!");

  enum DataClassMask {
    DC_NAN = 1 << 6,
    DC_NEG_INF = 1 << 4,
    DC_POS_INF = 1 << 5,
    DC_NEG_ZERO = 1 << 2,
    DC_POS_ZERO = 1 << 3,
    DC_NEG_SUBNORM = 1,
    DC_POS_SUBNORM = 1 << 1,
  };

  EVT VT = Op.getValueType();

  unsigned TestOp = VT == MVT::f128 ? PPC::XSTSTDCQP
                    : VT == MVT::f64 ? PPC::XSTSTDCDP
                                     : PPC::XSTSTDCSP;

  if (Mask == fcAllFlags)
    return DAG.getBoolConstant(true, Dl, MVT::i1, VT);
  if (Mask == 0)
    return DAG.getBoolConstant(false, Dl, MVT::i1, VT);

  // When it's cheaper or necessary to test reverse flags.
  if ((Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan) {
    SDValue Rev = getDataClassTest(Op, ~Mask, Dl, DAG, Subtarget);
    return DAG.getNOT(Dl, Rev, MVT::i1);
  }

  // Power doesn't support testing whether a value is 'normal'.
Test the rest11340// first, and test if it's 'not not-normal' with expected sign.11341if (Mask & fcNormal) {11342SDValue Rev(DAG.getMachineNode(11343TestOp, Dl, MVT::i32,11344DAG.getTargetConstant(DC_NAN | DC_NEG_INF | DC_POS_INF |11345DC_NEG_ZERO | DC_POS_ZERO |11346DC_NEG_SUBNORM | DC_POS_SUBNORM,11347Dl, MVT::i32),11348Op),113490);11350// Sign are stored in CR bit 0, result are in CR bit 2.11351SDValue Sign(11352DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, Rev,11353DAG.getTargetConstant(PPC::sub_lt, Dl, MVT::i32)),113540);11355SDValue Normal(DAG.getNOT(11356Dl,11357SDValue(DAG.getMachineNode(11358TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, Rev,11359DAG.getTargetConstant(PPC::sub_eq, Dl, MVT::i32)),113600),11361MVT::i1));11362if (Mask & fcPosNormal)11363Sign = DAG.getNOT(Dl, Sign, MVT::i1);11364SDValue Result = DAG.getNode(ISD::AND, Dl, MVT::i1, Sign, Normal);11365if (Mask == fcPosNormal || Mask == fcNegNormal)11366return Result;1136711368return DAG.getNode(11369ISD::OR, Dl, MVT::i1,11370getDataClassTest(Op, Mask & ~fcNormal, Dl, DAG, Subtarget), Result);11371}1137211373// The instruction doesn't differentiate between signaling or quiet NaN. Test11374// the rest first, and test if it 'is NaN and is signaling/quiet'.11375if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) {11376bool IsQuiet = Mask & fcQNan;11377SDValue NanCheck = getDataClassTest(Op, fcNan, Dl, DAG, Subtarget);1137811379// Quietness is determined by the first bit in fraction field.11380uint64_t QuietMask = 0;11381SDValue HighWord;11382if (VT == MVT::f128) {11383HighWord = DAG.getNode(11384ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op),11385DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 3 : 0, Dl));11386QuietMask = 0x8000;11387} else if (VT == MVT::f64) {11388if (Subtarget.isPPC64()) {11389HighWord = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32,11390DAG.getBitcast(MVT::i64, Op),11391DAG.getConstant(1, Dl, MVT::i32));11392} else {11393SDValue Vec = DAG.getBitcast(11394MVT::v4i32, DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v2f64, Op));11395HighWord = DAG.getNode(11396ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, Vec,11397DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 1 : 0, Dl));11398}11399QuietMask = 0x80000;11400} else if (VT == MVT::f32) {11401HighWord = DAG.getBitcast(MVT::i32, Op);11402QuietMask = 0x400000;11403}11404SDValue NanRes = DAG.getSetCC(11405Dl, MVT::i1,11406DAG.getNode(ISD::AND, Dl, MVT::i32, HighWord,11407DAG.getConstant(QuietMask, Dl, MVT::i32)),11408DAG.getConstant(0, Dl, MVT::i32), IsQuiet ? 
ISD::SETNE : ISD::SETEQ);11409NanRes = DAG.getNode(ISD::AND, Dl, MVT::i1, NanCheck, NanRes);11410if (Mask == fcQNan || Mask == fcSNan)11411return NanRes;1141211413return DAG.getNode(ISD::OR, Dl, MVT::i1,11414getDataClassTest(Op, Mask & ~fcNan, Dl, DAG, Subtarget),11415NanRes);11416}1141711418unsigned NativeMask = 0;11419if ((Mask & fcNan) == fcNan)11420NativeMask |= DC_NAN;11421if (Mask & fcNegInf)11422NativeMask |= DC_NEG_INF;11423if (Mask & fcPosInf)11424NativeMask |= DC_POS_INF;11425if (Mask & fcNegZero)11426NativeMask |= DC_NEG_ZERO;11427if (Mask & fcPosZero)11428NativeMask |= DC_POS_ZERO;11429if (Mask & fcNegSubnormal)11430NativeMask |= DC_NEG_SUBNORM;11431if (Mask & fcPosSubnormal)11432NativeMask |= DC_POS_SUBNORM;11433return SDValue(11434DAG.getMachineNode(11435TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1,11436SDValue(DAG.getMachineNode(11437TestOp, Dl, MVT::i32,11438DAG.getTargetConstant(NativeMask, Dl, MVT::i32), Op),114390),11440DAG.getTargetConstant(PPC::sub_eq, Dl, MVT::i32)),114410);11442}1144311444SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,11445SelectionDAG &DAG) const {11446assert(Subtarget.hasP9Vector() && "Test data class requires Power9");11447SDValue LHS = Op.getOperand(0);11448uint64_t RHSC = Op.getConstantOperandVal(1);11449SDLoc Dl(Op);11450FPClassTest Category = static_cast<FPClassTest>(RHSC);11451return getDataClassTest(LHS, Category, Dl, DAG, Subtarget);11452}1145311454SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,11455SelectionDAG &DAG) const {11456SDLoc dl(Op);11457// Create a stack slot that is 16-byte aligned.11458MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();11459int FrameIdx = MFI.CreateStackObject(16, Align(16), false);11460EVT PtrVT = getPointerTy(DAG.getDataLayout());11461SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);1146211463// Store the input value into Value#0 of the stack slot.11464SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,11465MachinePointerInfo());11466// Load it out.11467return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());11468}1146911470SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,11471SelectionDAG &DAG) const {11472assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&11473"Should only be called for ISD::INSERT_VECTOR_ELT");1147411475ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));1147611477EVT VT = Op.getValueType();11478SDLoc dl(Op);11479SDValue V1 = Op.getOperand(0);11480SDValue V2 = Op.getOperand(1);1148111482if (VT == MVT::v2f64 && C)11483return Op;1148411485if (Subtarget.hasP9Vector()) {11486// A f32 load feeding into a v4f32 insert_vector_elt is handled in this way11487// because on P10, it allows this specific insert_vector_elt load pattern to11488// utilize the refactored load and store infrastructure in order to exploit11489// prefixed loads.11490// On targets with inexpensive direct moves (Power9 and up), a11491// (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer11492// load since a single precision load will involve conversion to double11493// precision on the load followed by another conversion to single precision.11494if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) &&11495(isa<LoadSDNode>(V2))) {11496SDValue BitcastVector = DAG.getBitcast(MVT::v4i32, V1);11497SDValue BitcastLoad = DAG.getBitcast(MVT::i32, V2);11498SDValue InsVecElt =11499DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i32, BitcastVector,11500BitcastLoad, Op.getOperand(2));11501return 
      DAG.getBitcast(MVT::v4f32, InsVecElt);
    }
  }

  if (Subtarget.isISA3_1()) {
    if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
      return SDValue();
    // On P10, we have legal lowering for constant and variable indices for
    // all vectors.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
        VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
      return Op;
  }

  // Before P10, we have legal lowering for constant indices but not for
  // variable ones.
  if (!C)
    return SDValue();

  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
    unsigned InsertAtElement = C->getZExtValue();
    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
    if (Subtarget.isLittleEndian()) {
      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
    }
    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return Op;
}

SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
  SDValue LoadChain = LN->getChain();
  SDValue BasePtr = LN->getBasePtr();
  EVT VT = Op.getValueType();

  if (VT != MVT::v256i1 && VT != MVT::v512i1)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
  // 2 or 4 vsx registers.
  assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");
  Align Alignment = LN->getAlign();
  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> LoadChains;
  unsigned NumVecs = VT.getSizeInBits() / 128;
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    SDValue Load =
        DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr,
                    LN->getPointerInfo().getWithOffset(Idx * 16),
                    commonAlignment(Alignment, Idx * 16),
                    LN->getMemOperand()->getFlags(), LN->getAAInfo());
    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                          DAG.getConstant(16, dl, BasePtr.getValueType()));
    Loads.push_back(Load);
    LoadChains.push_back(Load.getValue(1));
  }
  if (Subtarget.isLittleEndian()) {
    std::reverse(Loads.begin(), Loads.end());
    std::reverse(LoadChains.begin(), LoadChains.end());
  }
  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
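  // Illustrative example (not part of the original comments): a v512i1 (MMA
  // accumulator) load is split above into four v16i8 loads at byte offsets
  // 0, 16, 32 and 48, all chained through the TokenFactor; on little-endian
  // targets the loads were reversed, so the first operand of the build node
  // created below is the highest-addressed quadword.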
  SDValue Value =
      DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
                  dl, VT, Loads);
  SDValue RetOps[] = {Value, TF};
  return DAG.getMergeValues(RetOps, dl);
}

SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
  SDValue StoreChain = SN->getChain();
  SDValue BasePtr = SN->getBasePtr();
  SDValue Value = SN->getValue();
  SDValue Value2 = SN->getValue();
  EVT StoreVT = Value.getValueType();

  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
  // underlying registers individually.
  assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");
  Align Alignment = SN->getAlign();
  SmallVector<SDValue, 4> Stores;
  unsigned NumVecs = 2;
  if (StoreVT == MVT::v512i1) {
    if (Subtarget.isISAFuture()) {
      EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
      MachineSDNode *ExtNode = DAG.getMachineNode(
          PPC::DMXXEXTFDMR512, dl, ReturnTypes, Op.getOperand(1));

      Value = SDValue(ExtNode, 0);
      Value2 = SDValue(ExtNode, 1);
    } else
      Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
    NumVecs = 4;
  }
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
    SDValue Elt;
    if (Subtarget.isISAFuture()) {
      VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
      Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
                        Idx > 1 ? Value2 : Value,
                        DAG.getConstant(VecNum, dl,
                                        getPointerTy(DAG.getDataLayout())));
    } else
      Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
                        DAG.getConstant(VecNum, dl,
                                        getPointerTy(DAG.getDataLayout())));

    SDValue Store =
        DAG.getStore(StoreChain, dl, Elt, BasePtr,
                     SN->getPointerInfo().getWithOffset(Idx * 16),
                     commonAlignment(Alignment, Idx * 16),
                     SN->getMemOperand()->getFlags(), SN->getAAInfo());
    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                          DAG.getConstant(16, dl, BasePtr.getValueType()));
    Stores.push_back(Store);
  }
  SDValue TF = DAG.getTokenFactor(dl, Stores);
  return TF;
}

SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (Op.getValueType() == MVT::v4i32) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
    // +16 as shift amt.
    SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
    SDValue RHSSwap = // = vrlw RHS, 16
        BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);

    // Shrinkify inputs to v8i16.
    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
                                      LHS, RHS, DAG, dl, MVT::v4i32);

    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
    // Shift the high parts up 16 bits.
    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
                              Neg16, DAG, dl);
    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
  } else if (Op.getValueType() == MVT::v16i8) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
    bool isLittleEndian = Subtarget.isLittleEndian();

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
                                         LHS, RHS, DAG, dl, MVT::v8i16);
    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
                                        LHS, RHS, DAG, dl, MVT::v8i16);
    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);

    // Merge the results together. Because vmuleub and vmuloub are
    // instructions with a big-endian bias, we must reverse the
    // element numbering and reverse the meaning of "odd" and "even"
    // when generating little endian code.
    int Ops[16];
    for (unsigned i = 0; i != 8; ++i) {
      if (isLittleEndian) {
        Ops[i*2  ] = 2*i;
        Ops[i*2+1] = 2*i+16;
      } else {
        Ops[i*2  ] = 2*i+1;
        Ops[i*2+1] = 2*i+1+16;
      }
    }
    if (isLittleEndian)
      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
    else
      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
  } else {
    llvm_unreachable("Unknown mul to lower!");
  }
}

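// Illustrative note on the v16i8 path above (not part of the original
// comments): vmuleub/vmuloub produce 16-bit products, so after the bitcasts
// the interleaving shuffle selects the low-order byte of each product. For
// big-endian the mask built above is {1, 17, 3, 19, ...}; for little-endian
// it is {0, 16, 2, 18, ...} with the odd and even results swapped as the
// shuffle inputs.
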
SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  if (Op.getOperand(IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
      !Subtarget.hasP9Vector())
    return SDValue();

  return Op;
}

// Custom lowering for fpext v2f32 to v2f64
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {

  assert(Op.getOpcode() == ISD::FP_EXTEND &&
         "Should only be called for ISD::FP_EXTEND");

  // FIXME: handle extends from half precision float vectors on P9.
  // We only want to custom lower an extend from v2f32 to v2f64.
  if (Op.getValueType() != MVT::v2f64 ||
      Op.getOperand(0).getValueType() != MVT::v2f32)
    return SDValue();

  SDLoc dl(Op);
  SDValue Op0 = Op.getOperand(0);

  switch (Op0.getOpcode()) {
  default:
    return SDValue();
  case ISD::EXTRACT_SUBVECTOR: {
    assert(Op0.getNumOperands() == 2 &&
           isa<ConstantSDNode>(Op0->getOperand(1)) &&
           "Node should have 2 operands with second one being a constant!");

    if (Op0.getOperand(0).getValueType() != MVT::v4f32)
      return SDValue();

    // Custom lower is only done for high or low doubleword.
    int Idx = Op0.getConstantOperandVal(1);
    if (Idx % 2 != 0)
      return SDValue();

    // Since input is v4f32, at this point Idx is either 0 or 2.
    // Shift to get the doubleword position we want.
    int DWord = Idx >> 1;

    // High and low word positions are different on little endian.
    if (Subtarget.isLittleEndian())
      DWord ^= 0x1;

    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
                       Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
  }
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FSUB: {
    SDValue NewLoad[2];
    for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
      // Ensure both inputs are loads.
      SDValue LdOp = Op0.getOperand(i);
      if (LdOp.getOpcode() != ISD::LOAD)
        return SDValue();
      // Generate new load node.
      LoadSDNode *LD = cast<LoadSDNode>(LdOp);
      SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
      NewLoad[i] = DAG.getMemIntrinsicNode(
          PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
          LD->getMemoryVT(), LD->getMemOperand());
    }
    SDValue NewOp =
        DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
                    NewLoad[1], Op0.getNode()->getFlags());
    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
                       DAG.getConstant(0, dl, MVT::i32));
  }
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op0);
    SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
    SDValue NewLd = DAG.getMemIntrinsicNode(
        PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
        LD->getMemoryVT(), LD->getMemOperand());
    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
                       DAG.getConstant(0, dl, MVT::i32));
  }
  }
  llvm_unreachable("ERROR: Should return for all cases within switch.");
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
  case ISD::FPOW: return lowerPow(Op, DAG);
  case ISD::FSIN: return lowerSin(Op, DAG);
  case ISD::FCOS: return lowerCos(Op, DAG);
  case ISD::FLOG: return lowerLog(Op, DAG);
  case ISD::FLOG10: return lowerLog10(Op, DAG);
  case ISD::FEXP: return lowerExp(Op, DAG);
  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
  case
ISD::BlockAddress: return LowerBlockAddress(Op, DAG);11803case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);11804case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);11805case ISD::JumpTable: return LowerJumpTable(Op, DAG);11806case ISD::STRICT_FSETCC:11807case ISD::STRICT_FSETCCS:11808case ISD::SETCC: return LowerSETCC(Op, DAG);11809case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);11810case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);1181111812case ISD::INLINEASM:11813case ISD::INLINEASM_BR: return LowerINLINEASM(Op, DAG);11814// Variable argument lowering.11815case ISD::VASTART: return LowerVASTART(Op, DAG);11816case ISD::VAARG: return LowerVAARG(Op, DAG);11817case ISD::VACOPY: return LowerVACOPY(Op, DAG);1181811819case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG);11820case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);11821case ISD::GET_DYNAMIC_AREA_OFFSET:11822return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);1182311824// Exception handling lowering.11825case ISD::EH_DWARF_CFA: return LowerEH_DWARF_CFA(Op, DAG);11826case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);11827case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);1182811829case ISD::LOAD: return LowerLOAD(Op, DAG);11830case ISD::STORE: return LowerSTORE(Op, DAG);11831case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);11832case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);11833case ISD::STRICT_FP_TO_UINT:11834case ISD::STRICT_FP_TO_SINT:11835case ISD::FP_TO_UINT:11836case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op));11837case ISD::STRICT_UINT_TO_FP:11838case ISD::STRICT_SINT_TO_FP:11839case ISD::UINT_TO_FP:11840case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);11841case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);1184211843// Lower 64-bit shifts.11844case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);11845case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);11846case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);1184711848case ISD::FSHL: return LowerFunnelShift(Op, DAG);11849case ISD::FSHR: return LowerFunnelShift(Op, DAG);1185011851// Vector-related lowering.11852case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);11853case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);11854case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);11855case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);11856case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);11857case ISD::MUL: return LowerMUL(Op, DAG);11858case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);11859case ISD::STRICT_FP_ROUND:11860case ISD::FP_ROUND:11861return LowerFP_ROUND(Op, DAG);11862case ISD::ROTL: return LowerROTL(Op, DAG);1186311864// For counter-based loop handling.11865case ISD::INTRINSIC_W_CHAIN: return SDValue();1186611867case ISD::BITCAST: return LowerBITCAST(Op, DAG);1186811869// Frame & Return address.11870case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);11871case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);1187211873case ISD::INTRINSIC_VOID:11874return LowerINTRINSIC_VOID(Op, DAG);11875case ISD::BSWAP:11876return LowerBSWAP(Op, DAG);11877case ISD::ATOMIC_CMP_SWAP:11878return LowerATOMIC_CMP_SWAP(Op, DAG);11879case ISD::ATOMIC_STORE:11880return LowerATOMIC_LOAD_STORE(Op, DAG);11881case ISD::IS_FPCLASS:11882return LowerIS_FPCLASS(Op, DAG);11883}11884}1188511886void PPCTargetLowering::ReplaceNodeResults(SDNode *N,11887SmallVectorImpl<SDValue>&Results,11888SelectionDAG &DAG) 
const {11889SDLoc dl(N);11890switch (N->getOpcode()) {11891default:11892llvm_unreachable("Do not know how to custom type legalize this operation!");11893case ISD::ATOMIC_LOAD: {11894SDValue Res = LowerATOMIC_LOAD_STORE(SDValue(N, 0), DAG);11895Results.push_back(Res);11896Results.push_back(Res.getValue(1));11897break;11898}11899case ISD::READCYCLECOUNTER: {11900SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);11901SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));1190211903Results.push_back(11904DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));11905Results.push_back(RTB.getValue(2));11906break;11907}11908case ISD::INTRINSIC_W_CHAIN: {11909if (N->getConstantOperandVal(1) != Intrinsic::loop_decrement)11910break;1191111912assert(N->getValueType(0) == MVT::i1 &&11913"Unexpected result type for CTR decrement intrinsic");11914EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),11915N->getValueType(0));11916SDVTList VTs = DAG.getVTList(SVT, MVT::Other);11917SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),11918N->getOperand(1));1191911920Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));11921Results.push_back(NewInt.getValue(1));11922break;11923}11924case ISD::INTRINSIC_WO_CHAIN: {11925switch (N->getConstantOperandVal(0)) {11926case Intrinsic::ppc_pack_longdouble:11927Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,11928N->getOperand(2), N->getOperand(1)));11929break;11930case Intrinsic::ppc_maxfe:11931case Intrinsic::ppc_minfe:11932case Intrinsic::ppc_fnmsub:11933case Intrinsic::ppc_convert_f128_to_ppcf128:11934Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG));11935break;11936}11937break;11938}11939case ISD::VAARG: {11940if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())11941return;1194211943EVT VT = N->getValueType(0);1194411945if (VT == MVT::i64) {11946SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);1194711948Results.push_back(NewNode);11949Results.push_back(NewNode.getValue(1));11950}11951return;11952}11953case ISD::STRICT_FP_TO_SINT:11954case ISD::STRICT_FP_TO_UINT:11955case ISD::FP_TO_SINT:11956case ISD::FP_TO_UINT: {11957// LowerFP_TO_INT() can only handle f32 and f64.11958if (N->getOperand(N->isStrictFPOpcode() ? 
1 : 0).getValueType() ==11959MVT::ppcf128)11960return;11961SDValue LoweredValue = LowerFP_TO_INT(SDValue(N, 0), DAG, dl);11962Results.push_back(LoweredValue);11963if (N->isStrictFPOpcode())11964Results.push_back(LoweredValue.getValue(1));11965return;11966}11967case ISD::TRUNCATE: {11968if (!N->getValueType(0).isVector())11969return;11970SDValue Lowered = LowerTRUNCATEVector(SDValue(N, 0), DAG);11971if (Lowered)11972Results.push_back(Lowered);11973return;11974}11975case ISD::FSHL:11976case ISD::FSHR:11977// Don't handle funnel shifts here.11978return;11979case ISD::BITCAST:11980// Don't handle bitcast here.11981return;11982case ISD::FP_EXTEND:11983SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);11984if (Lowered)11985Results.push_back(Lowered);11986return;11987}11988}1198911990//===----------------------------------------------------------------------===//11991// Other Lowering Code11992//===----------------------------------------------------------------------===//1199311994static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {11995Module *M = Builder.GetInsertBlock()->getParent()->getParent();11996Function *Func = Intrinsic::getDeclaration(M, Id);11997return Builder.CreateCall(Func, {});11998}1199912000// The mappings for emitLeading/TrailingFence is taken from12001// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html12002Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,12003Instruction *Inst,12004AtomicOrdering Ord) const {12005if (Ord == AtomicOrdering::SequentiallyConsistent)12006return callIntrinsic(Builder, Intrinsic::ppc_sync);12007if (isReleaseOrStronger(Ord))12008return callIntrinsic(Builder, Intrinsic::ppc_lwsync);12009return nullptr;12010}1201112012Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,12013Instruction *Inst,12014AtomicOrdering Ord) const {12015if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {12016// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and12017// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html12018// and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.12019if (isa<LoadInst>(Inst))12020return Builder.CreateCall(12021Intrinsic::getDeclaration(12022Builder.GetInsertBlock()->getParent()->getParent(),12023Intrinsic::ppc_cfence, {Inst->getType()}),12024{Inst});12025// FIXME: Can use isync for rmw operation.12026return callIntrinsic(Builder, Intrinsic::ppc_lwsync);12027}12028return nullptr;12029}1203012031MachineBasicBlock *12032PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,12033unsigned AtomicSize,12034unsigned BinOpcode,12035unsigned CmpOpcode,12036unsigned CmpPred) const {12037// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.12038const TargetInstrInfo *TII = Subtarget.getInstrInfo();1203912040auto LoadMnemonic = PPC::LDARX;12041auto StoreMnemonic = PPC::STDCX;12042switch (AtomicSize) {12043default:12044llvm_unreachable("Unexpected size of atomic entity");12045case 1:12046LoadMnemonic = PPC::LBARX;12047StoreMnemonic = PPC::STBCX;12048assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");12049break;12050case 2:12051LoadMnemonic = PPC::LHARX;12052StoreMnemonic = PPC::STHCX;12053assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");12054break;12055case 4:12056LoadMnemonic = PPC::LWARX;12057StoreMnemonic = PPC::STWCX;12058break;12059case 8:12060LoadMnemonic = PPC::LDARX;12061StoreMnemonic = PPC::STDCX;12062break;12063}1206412065const BasicBlock *LLVM_BB = 
BB->getBasicBlock();12066MachineFunction *F = BB->getParent();12067MachineFunction::iterator It = ++BB->getIterator();1206812069Register dest = MI.getOperand(0).getReg();12070Register ptrA = MI.getOperand(1).getReg();12071Register ptrB = MI.getOperand(2).getReg();12072Register incr = MI.getOperand(3).getReg();12073DebugLoc dl = MI.getDebugLoc();1207412075MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);12076MachineBasicBlock *loop2MBB =12077CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;12078MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);12079F->insert(It, loopMBB);12080if (CmpOpcode)12081F->insert(It, loop2MBB);12082F->insert(It, exitMBB);12083exitMBB->splice(exitMBB->begin(), BB,12084std::next(MachineBasicBlock::iterator(MI)), BB->end());12085exitMBB->transferSuccessorsAndUpdatePHIs(BB);1208612087MachineRegisterInfo &RegInfo = F->getRegInfo();12088Register TmpReg = (!BinOpcode) ? incr :12089RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass12090: &PPC::GPRCRegClass);1209112092// thisMBB:12093// ...12094// fallthrough --> loopMBB12095BB->addSuccessor(loopMBB);1209612097// loopMBB:12098// l[wd]arx dest, ptr12099// add r0, dest, incr12100// st[wd]cx. r0, ptr12101// bne- loopMBB12102// fallthrough --> exitMBB1210312104// For max/min...12105// loopMBB:12106// l[wd]arx dest, ptr12107// cmpl?[wd] dest, incr12108// bgt exitMBB12109// loop2MBB:12110// st[wd]cx. dest, ptr12111// bne- loopMBB12112// fallthrough --> exitMBB1211312114BB = loopMBB;12115BuildMI(BB, dl, TII->get(LoadMnemonic), dest)12116.addReg(ptrA).addReg(ptrB);12117if (BinOpcode)12118BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);12119if (CmpOpcode) {12120Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);12121// Signed comparisons of byte or halfword values must be sign-extended.12122if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {12123Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);12124BuildMI(BB, dl, TII->get(AtomicSize == 1 ? 
PPC::EXTSB : PPC::EXTSH),12125ExtReg).addReg(dest);12126BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(ExtReg).addReg(incr);12127} else12128BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(dest).addReg(incr);1212912130BuildMI(BB, dl, TII->get(PPC::BCC))12131.addImm(CmpPred)12132.addReg(CrReg)12133.addMBB(exitMBB);12134BB->addSuccessor(loop2MBB);12135BB->addSuccessor(exitMBB);12136BB = loop2MBB;12137}12138BuildMI(BB, dl, TII->get(StoreMnemonic))12139.addReg(TmpReg).addReg(ptrA).addReg(ptrB);12140BuildMI(BB, dl, TII->get(PPC::BCC))12141.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);12142BB->addSuccessor(loopMBB);12143BB->addSuccessor(exitMBB);1214412145// exitMBB:12146// ...12147BB = exitMBB;12148return BB;12149}1215012151static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {12152switch(MI.getOpcode()) {12153default:12154return false;12155case PPC::COPY:12156return TII->isSignExtended(MI.getOperand(1).getReg(),12157&MI.getMF()->getRegInfo());12158case PPC::LHA:12159case PPC::LHA8:12160case PPC::LHAU:12161case PPC::LHAU8:12162case PPC::LHAUX:12163case PPC::LHAUX8:12164case PPC::LHAX:12165case PPC::LHAX8:12166case PPC::LWA:12167case PPC::LWAUX:12168case PPC::LWAX:12169case PPC::LWAX_32:12170case PPC::LWA_32:12171case PPC::PLHA:12172case PPC::PLHA8:12173case PPC::PLHA8pc:12174case PPC::PLHApc:12175case PPC::PLWA:12176case PPC::PLWA8:12177case PPC::PLWA8pc:12178case PPC::PLWApc:12179case PPC::EXTSB:12180case PPC::EXTSB8:12181case PPC::EXTSB8_32_64:12182case PPC::EXTSB8_rec:12183case PPC::EXTSB_rec:12184case PPC::EXTSH:12185case PPC::EXTSH8:12186case PPC::EXTSH8_32_64:12187case PPC::EXTSH8_rec:12188case PPC::EXTSH_rec:12189case PPC::EXTSW:12190case PPC::EXTSWSLI:12191case PPC::EXTSWSLI_32_64:12192case PPC::EXTSWSLI_32_64_rec:12193case PPC::EXTSWSLI_rec:12194case PPC::EXTSW_32:12195case PPC::EXTSW_32_64:12196case PPC::EXTSW_32_64_rec:12197case PPC::EXTSW_rec:12198case PPC::SRAW:12199case PPC::SRAWI:12200case PPC::SRAWI_rec:12201case PPC::SRAW_rec:12202return true;12203}12204return false;12205}1220612207MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(12208MachineInstr &MI, MachineBasicBlock *BB,12209bool is8bit, // operation12210unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {12211// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.12212const PPCInstrInfo *TII = Subtarget.getInstrInfo();1221312214// If this is a signed comparison and the value being compared is not known12215// to be sign extended, sign extend it here.12216DebugLoc dl = MI.getDebugLoc();12217MachineFunction *F = BB->getParent();12218MachineRegisterInfo &RegInfo = F->getRegInfo();12219Register incr = MI.getOperand(3).getReg();12220bool IsSignExtended =12221incr.isVirtual() && isSignExtended(*RegInfo.getVRegDef(incr), TII);1222212223if (CmpOpcode == PPC::CMPW && !IsSignExtended) {12224Register ValueReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);12225BuildMI(*BB, MI, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueReg)12226.addReg(MI.getOperand(3).getReg());12227MI.getOperand(3).setReg(ValueReg);12228incr = ValueReg;12229}12230// If we support part-word atomic mnemonics, just use them12231if (Subtarget.hasPartwordAtomics())12232return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,12233CmpPred);1223412235// In 64 bit mode we have to use 64 bits for addresses, even though the12236// lwarx/stwcx are 32 bits. 
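  // (Illustrative note, not from the original comment: to update, say, the
  // byte whose address has low-order bits 0b10, the sequence below rounds the
  // address down to the containing word and turns the byte offset into a bit
  // shift -- 16 on a little-endian target, 8 on a big-endian one -- so the
  // byte can be masked and rewritten inside a word-sized lwarx/stwcx. loop.)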
  // With the 32-bit atomics we can use address registers without caring
  // whether they're 32 or 64, but here we're doing actual arithmetic on the
  // addresses.
  bool is64bit = Subtarget.isPPC64();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register ptrA = MI.getOperand(1).getReg();
  Register ptrB = MI.getOperand(2).getReg();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  if (CmpOpcode)
    F->insert(It, loop2MBB);
  F->insert(It, exitMBB);
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *RC =
      is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register PtrReg = RegInfo.createVirtualRegister(RC);
  Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
  Register ShiftReg =
      isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
  Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
  Register MaskReg = RegInfo.createVirtualRegister(GPRC);
  Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
  Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
  Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
  Register SrwDestReg = RegInfo.createVirtualRegister(GPRC);
  Register Ptr1Reg;
  Register TmpReg =
      (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // The 4-byte load must be aligned, while a char or short may be
  // anywhere in the word.  Hence all this nasty bookkeeping code.
  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
  //   xori shift, shift1, 24 [16]
  //   rlwinm ptr, ptr1, 0, 0, 29
  //   slw incr2, incr, shift
  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
  //   slw mask, mask2, shift
  // loopMBB:
  //   lwarx tmpDest, ptr
  //   add tmp, tmpDest, incr2
  //   andc tmp2, tmpDest, mask
  //   and tmp3, tmp, mask
  //   or tmp4, tmp3, tmp2
  //   stwcx. tmp4, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //   srw SrwDest, tmpDest, shift
  //   rlwinm SrwDest, SrwDest, 0, 24 [16], 31
  if (ptrA != ZeroReg) {
    Ptr1Reg = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
        .addReg(ptrA)
        .addReg(ptrB);
  } else {
    Ptr1Reg = ptrB;
  }
  // We need to use a 32-bit subregister here to avoid a register class
  // mismatch in 64-bit mode.
  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
      .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
      .addImm(3)
      .addImm(27)
      .addImm(is8bit ? 28 : 27);
  if (!isLittleEndian)
    BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
        .addReg(Shift1Reg)
        .addImm(is8bit ?
24 : 16);12327if (is64bit)12328BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)12329.addReg(Ptr1Reg)12330.addImm(0)12331.addImm(61);12332else12333BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)12334.addReg(Ptr1Reg)12335.addImm(0)12336.addImm(0)12337.addImm(29);12338BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg);12339if (is8bit)12340BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);12341else {12342BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);12343BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)12344.addReg(Mask3Reg)12345.addImm(65535);12346}12347BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)12348.addReg(Mask2Reg)12349.addReg(ShiftReg);1235012351BB = loopMBB;12352BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)12353.addReg(ZeroReg)12354.addReg(PtrReg);12355if (BinOpcode)12356BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)12357.addReg(Incr2Reg)12358.addReg(TmpDestReg);12359BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)12360.addReg(TmpDestReg)12361.addReg(MaskReg);12362BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg);12363if (CmpOpcode) {12364// For unsigned comparisons, we can directly compare the shifted values.12365// For signed comparisons we shift and sign extend.12366Register SReg = RegInfo.createVirtualRegister(GPRC);12367Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);12368BuildMI(BB, dl, TII->get(PPC::AND), SReg)12369.addReg(TmpDestReg)12370.addReg(MaskReg);12371unsigned ValueReg = SReg;12372unsigned CmpReg = Incr2Reg;12373if (CmpOpcode == PPC::CMPW) {12374ValueReg = RegInfo.createVirtualRegister(GPRC);12375BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)12376.addReg(SReg)12377.addReg(ShiftReg);12378Register ValueSReg = RegInfo.createVirtualRegister(GPRC);12379BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)12380.addReg(ValueReg);12381ValueReg = ValueSReg;12382CmpReg = incr;12383}12384BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(ValueReg).addReg(CmpReg);12385BuildMI(BB, dl, TII->get(PPC::BCC))12386.addImm(CmpPred)12387.addReg(CrReg)12388.addMBB(exitMBB);12389BB->addSuccessor(loop2MBB);12390BB->addSuccessor(exitMBB);12391BB = loop2MBB;12392}12393BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg);12394BuildMI(BB, dl, TII->get(PPC::STWCX))12395.addReg(Tmp4Reg)12396.addReg(ZeroReg)12397.addReg(PtrReg);12398BuildMI(BB, dl, TII->get(PPC::BCC))12399.addImm(PPC::PRED_NE)12400.addReg(PPC::CR0)12401.addMBB(loopMBB);12402BB->addSuccessor(loopMBB);12403BB->addSuccessor(exitMBB);1240412405// exitMBB:12406// ...12407BB = exitMBB;12408// Since the shift amount is not a constant, we need to clear12409// the upper bits with a separate RLWINM.12410BuildMI(*BB, BB->begin(), dl, TII->get(PPC::RLWINM), dest)12411.addReg(SrwDestReg)12412.addImm(0)12413.addImm(is8bit ? 
24 : 16)12414.addImm(31);12415BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), SrwDestReg)12416.addReg(TmpDestReg)12417.addReg(ShiftReg);12418return BB;12419}1242012421llvm::MachineBasicBlock *12422PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,12423MachineBasicBlock *MBB) const {12424DebugLoc DL = MI.getDebugLoc();12425const TargetInstrInfo *TII = Subtarget.getInstrInfo();12426const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();1242712428MachineFunction *MF = MBB->getParent();12429MachineRegisterInfo &MRI = MF->getRegInfo();1243012431const BasicBlock *BB = MBB->getBasicBlock();12432MachineFunction::iterator I = ++MBB->getIterator();1243312434Register DstReg = MI.getOperand(0).getReg();12435const TargetRegisterClass *RC = MRI.getRegClass(DstReg);12436assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");12437Register mainDstReg = MRI.createVirtualRegister(RC);12438Register restoreDstReg = MRI.createVirtualRegister(RC);1243912440MVT PVT = getPointerTy(MF->getDataLayout());12441assert((PVT == MVT::i64 || PVT == MVT::i32) &&12442"Invalid Pointer Size!");12443// For v = setjmp(buf), we generate12444//12445// thisMBB:12446// SjLjSetup mainMBB12447// bl mainMBB12448// v_restore = 112449// b sinkMBB12450//12451// mainMBB:12452// buf[LabelOffset] = LR12453// v_main = 012454//12455// sinkMBB:12456// v = phi(main, restore)12457//1245812459MachineBasicBlock *thisMBB = MBB;12460MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);12461MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);12462MF->insert(I, mainMBB);12463MF->insert(I, sinkMBB);1246412465MachineInstrBuilder MIB;1246612467// Transfer the remainder of BB and its successor edges to sinkMBB.12468sinkMBB->splice(sinkMBB->begin(), MBB,12469std::next(MachineBasicBlock::iterator(MI)), MBB->end());12470sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);1247112472// Note that the structure of the jmp_buf used here is not compatible12473// with that used by libc, and is not designed to be. Specifically, it12474// stores only those 'reserved' registers that LLVM does not otherwise12475// understand how to spill. Also, by convention, by the time this12476// intrinsic is called, Clang has already stored the frame address in the12477// first slot of the buffer and stack address in the third. Following the12478// X86 target code, we'll store the jump address in the second slot. We also12479// need to save the TOC pointer (R2) to handle jumps between shared12480// libraries, and that will be stored in the fourth slot. The thread12481// identifier (R13) is not affected.1248212483// thisMBB:12484const int64_t LabelOffset = 1 * PVT.getStoreSize();12485const int64_t TOCOffset = 3 * PVT.getStoreSize();12486const int64_t BPOffset = 4 * PVT.getStoreSize();1248712488// Prepare IP either in reg.12489const TargetRegisterClass *PtrRC = getRegClassFor(PVT);12490Register LabelReg = MRI.createVirtualRegister(PtrRC);12491Register BufReg = MI.getOperand(1).getReg();1249212493if (Subtarget.is64BitELFABI()) {12494setUsesTOCBasePtr(*MBB->getParent());12495MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))12496.addReg(PPC::X2)12497.addImm(TOCOffset)12498.addReg(BufReg)12499.cloneMemRefs(MI);12500}1250112502// Naked functions never have a base pointer, and so we use r1. For all12503// other functions, this decision must be delayed until during PEI.12504unsigned BaseReg;12505if (MF->getFunction().hasFnAttribute(Attribute::Naked))12506BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;12507else12508BaseReg = Subtarget.isPPC64() ? 
PPC::BP8 : PPC::BP;1250912510MIB = BuildMI(*thisMBB, MI, DL,12511TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))12512.addReg(BaseReg)12513.addImm(BPOffset)12514.addReg(BufReg)12515.cloneMemRefs(MI);1251612517// Setup12518MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);12519MIB.addRegMask(TRI->getNoPreservedMask());1252012521BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);1252212523MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))12524.addMBB(mainMBB);12525MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);1252612527thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());12528thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());1252912530// mainMBB:12531// mainDstReg = 012532MIB =12533BuildMI(mainMBB, DL,12534TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);1253512536// Store IP12537if (Subtarget.isPPC64()) {12538MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))12539.addReg(LabelReg)12540.addImm(LabelOffset)12541.addReg(BufReg);12542} else {12543MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))12544.addReg(LabelReg)12545.addImm(LabelOffset)12546.addReg(BufReg);12547}12548MIB.cloneMemRefs(MI);1254912550BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);12551mainMBB->addSuccessor(sinkMBB);1255212553// sinkMBB:12554BuildMI(*sinkMBB, sinkMBB->begin(), DL,12555TII->get(PPC::PHI), DstReg)12556.addReg(mainDstReg).addMBB(mainMBB)12557.addReg(restoreDstReg).addMBB(thisMBB);1255812559MI.eraseFromParent();12560return sinkMBB;12561}1256212563MachineBasicBlock *12564PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,12565MachineBasicBlock *MBB) const {12566DebugLoc DL = MI.getDebugLoc();12567const TargetInstrInfo *TII = Subtarget.getInstrInfo();1256812569MachineFunction *MF = MBB->getParent();12570MachineRegisterInfo &MRI = MF->getRegInfo();1257112572MVT PVT = getPointerTy(MF->getDataLayout());12573assert((PVT == MVT::i64 || PVT == MVT::i32) &&12574"Invalid Pointer Size!");1257512576const TargetRegisterClass *RC =12577(PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;12578Register Tmp = MRI.createVirtualRegister(RC);12579// Since FP is only updated here but NOT referenced, it's treated as GPR.12580unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;12581unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;12582unsigned BP =12583(PVT == MVT::i64)12584? PPC::X3012585: (Subtarget.isSVR4ABI() && isPositionIndependent() ? 
PPC::R2912586: PPC::R30);1258712588MachineInstrBuilder MIB;1258912590const int64_t LabelOffset = 1 * PVT.getStoreSize();12591const int64_t SPOffset = 2 * PVT.getStoreSize();12592const int64_t TOCOffset = 3 * PVT.getStoreSize();12593const int64_t BPOffset = 4 * PVT.getStoreSize();1259412595Register BufReg = MI.getOperand(0).getReg();1259612597// Reload FP (the jumped-to function may not have had a12598// frame pointer, and if so, then its r31 will be restored12599// as necessary).12600if (PVT == MVT::i64) {12601MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)12602.addImm(0)12603.addReg(BufReg);12604} else {12605MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)12606.addImm(0)12607.addReg(BufReg);12608}12609MIB.cloneMemRefs(MI);1261012611// Reload IP12612if (PVT == MVT::i64) {12613MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)12614.addImm(LabelOffset)12615.addReg(BufReg);12616} else {12617MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)12618.addImm(LabelOffset)12619.addReg(BufReg);12620}12621MIB.cloneMemRefs(MI);1262212623// Reload SP12624if (PVT == MVT::i64) {12625MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)12626.addImm(SPOffset)12627.addReg(BufReg);12628} else {12629MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)12630.addImm(SPOffset)12631.addReg(BufReg);12632}12633MIB.cloneMemRefs(MI);1263412635// Reload BP12636if (PVT == MVT::i64) {12637MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)12638.addImm(BPOffset)12639.addReg(BufReg);12640} else {12641MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)12642.addImm(BPOffset)12643.addReg(BufReg);12644}12645MIB.cloneMemRefs(MI);1264612647// Reload TOC12648if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {12649setUsesTOCBasePtr(*MBB->getParent());12650MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)12651.addImm(TOCOffset)12652.addReg(BufReg)12653.cloneMemRefs(MI);12654}1265512656// Jump12657BuildMI(*MBB, MI, DL,12658TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);12659BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));1266012661MI.eraseFromParent();12662return MBB;12663}1266412665bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {12666// If the function specifically requests inline stack probes, emit them.12667if (MF.getFunction().hasFnAttribute("probe-stack"))12668return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==12669"inline-asm";12670return false;12671}1267212673unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const {12674const TargetFrameLowering *TFI = Subtarget.getFrameLowering();12675unsigned StackAlign = TFI->getStackAlignment();12676assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&12677"Unexpected stack alignment");12678// The default stack probe size is 4096 if the function has no12679// stack-probe-size attribute.12680const Function &Fn = MF.getFunction();12681unsigned StackProbeSize =12682Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096);12683// Round down to the stack alignment.12684StackProbeSize &= ~(StackAlign - 1);12685return StackProbeSize ? StackProbeSize : StackAlign;12686}1268712688// Lower dynamic stack allocation with probing. `emitProbedAlloca` is splitted12689// into three phases. In the first phase, it uses pseudo instruction12690// PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and12691// FinalStackPtr. 
// In the second phase, it generates a loop for probing blocks. Finally, it
// uses the pseudo instruction DYNAREAOFFSET to get the future result of
// MaxCallFrameSize so that it can calculate the correct data area pointer.
MachineBasicBlock *
PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  const bool isPPC64 = Subtarget.isPPC64();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned ProbeSize = getStackProbeSize(*MF);
  const BasicBlock *ProbedBB = MBB->getBasicBlock();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // The CFG of the stack probing code looks like:
  //            +-----+
  //            | MBB |
  //            +--+--+
  //               |
  //          +----v----+
  //   +----->+ TestMBB +------+
  //   |      +----+----+      |
  //   |           |           |
  //   |     +-----v----+      |
  //   +-----+ BlockMBB |      |
  //         +----------+      |
  //                           |
  //          +---------+      |
  //          | TailMBB +<-----+
  //          +---------+
  // In MBB, calculate the previous frame pointer and the final stack pointer.
  // In TestMBB, test whether SP equals the final stack pointer; if so, jump to
  // TailMBB. In BlockMBB, update SP atomically and jump back to TestMBB.
  // TailMBB is spliced via \p MI.
  MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
  MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
  MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);

  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MF->insert(MBBIter, TestMBB);
  MF->insert(MBBIter, BlockMBB);
  MF->insert(MBBIter, TailMBB);

  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register DstReg = MI.getOperand(0).getReg();
  Register NegSizeReg = MI.getOperand(1).getReg();
  Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
  Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);

  // Since the value of NegSizeReg might be realigned during prologue/epilogue
  // insertion, insert a PREPARE_PROBED_ALLOCA pseudo instruction to get the
  // actual FramePointer and NegSize.
  unsigned ProbeOpc;
  if (!MRI.hasOneNonDBGUse(NegSizeReg))
    ProbeOpc =
        isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
  else
    // With PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG, ActualNegSizeReg and
    // NegSizeReg will be allocated to the same physical register, avoiding a
    // redundant copy when NegSizeReg has only one use (the current MI, which
    // will then be replaced by PREPARE_PROBED_ALLOCA).
    ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
                       : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
  BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer)
      .addDef(ActualNegSizeReg)
      .addReg(NegSizeReg)
      .add(MI.getOperand(2))
      .add(MI.getOperand(3));

  // Calculate the final stack pointer, which equals SP + ActualNegSize.
  BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4),
          FinalStackPtr)
      .addReg(SPReg)
      .addReg(ActualNegSizeReg);
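  // Illustrative example (not from the original source): with ProbeSize = 4096
  // and a dynamic allocation of 10000 bytes, ActualNegSize is -10000. The
  // residual probe emitted below advances SP by -(10000 % 4096) = -1808 bytes
  // with a single store-with-update, and the loop in BlockMBB then performs
  // two further -4096 byte probes until SP reaches FinalStackPtr.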
  // Materialize a scratch register for update.
  int64_t NegProbeSize = -(int64_t)ProbeSize;
  assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
  Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  if (!isInt<16>(NegProbeSize)) {
    Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
        .addImm(NegProbeSize >> 16);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
            ScratchReg)
        .addReg(TempReg)
        .addImm(NegProbeSize & 0xFFFF);
  } else
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
        .addImm(NegProbeSize);

  {
    // Probe the leading residual part.
    Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
        .addReg(ActualNegSizeReg)
        .addReg(ScratchReg);
    Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
        .addReg(Div)
        .addReg(ScratchReg);
    Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
        .addReg(Mul)
        .addReg(ActualNegSizeReg);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
        .addReg(FramePointer)
        .addReg(SPReg)
        .addReg(NegMod);
  }

  {
    // The remaining part should be a multiple of ProbeSize.
    Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
    BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
        .addReg(SPReg)
        .addReg(FinalStackPtr);
    BuildMI(TestMBB, DL, TII->get(PPC::BCC))
        .addImm(PPC::PRED_EQ)
        .addReg(CmpResult)
        .addMBB(TailMBB);
    TestMBB->addSuccessor(BlockMBB);
    TestMBB->addSuccessor(TailMBB);
  }

  {
    // Touch the block.
    // |P...|P...|P...
    BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
        .addReg(FramePointer)
        .addReg(SPReg)
        .addReg(ScratchReg);
    BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
    BlockMBB->addSuccessor(TestMBB);
  }

  // The calculation of MaxCallFrameSize is deferred to prologue/epilogue
  // insertion, so use the DYNAREAOFFSET pseudo instruction to get the future
  // result.
  Register MaxCallFrameSizeReg =
      MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  BuildMI(TailMBB, DL,
          TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
          MaxCallFrameSizeReg)
      .add(MI.getOperand(2))
      .add(MI.getOperand(3));
  BuildMI(TailMBB, DL, TII->get(isPPC64 ?
PPC::ADD8 : PPC::ADD4), DstReg)12840.addReg(SPReg)12841.addReg(MaxCallFrameSizeReg);1284212843// Splice instructions after MI to TailMBB.12844TailMBB->splice(TailMBB->end(), MBB,12845std::next(MachineBasicBlock::iterator(MI)), MBB->end());12846TailMBB->transferSuccessorsAndUpdatePHIs(MBB);12847MBB->addSuccessor(TestMBB);1284812849// Delete the pseudo instruction.12850MI.eraseFromParent();1285112852++NumDynamicAllocaProbed;12853return TailMBB;12854}1285512856static bool IsSelectCC(MachineInstr &MI) {12857switch (MI.getOpcode()) {12858case PPC::SELECT_CC_I4:12859case PPC::SELECT_CC_I8:12860case PPC::SELECT_CC_F4:12861case PPC::SELECT_CC_F8:12862case PPC::SELECT_CC_F16:12863case PPC::SELECT_CC_VRRC:12864case PPC::SELECT_CC_VSFRC:12865case PPC::SELECT_CC_VSSRC:12866case PPC::SELECT_CC_VSRC:12867case PPC::SELECT_CC_SPE4:12868case PPC::SELECT_CC_SPE:12869return true;12870default:12871return false;12872}12873}1287412875static bool IsSelect(MachineInstr &MI) {12876switch (MI.getOpcode()) {12877case PPC::SELECT_I4:12878case PPC::SELECT_I8:12879case PPC::SELECT_F4:12880case PPC::SELECT_F8:12881case PPC::SELECT_F16:12882case PPC::SELECT_SPE:12883case PPC::SELECT_SPE4:12884case PPC::SELECT_VRRC:12885case PPC::SELECT_VSFRC:12886case PPC::SELECT_VSSRC:12887case PPC::SELECT_VSRC:12888return true;12889default:12890return false;12891}12892}1289312894MachineBasicBlock *12895PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,12896MachineBasicBlock *BB) const {12897if (MI.getOpcode() == TargetOpcode::STACKMAP ||12898MI.getOpcode() == TargetOpcode::PATCHPOINT) {12899if (Subtarget.is64BitELFABI() &&12900MI.getOpcode() == TargetOpcode::PATCHPOINT &&12901!Subtarget.isUsingPCRelativeCalls()) {12902// Call lowering should have added an r2 operand to indicate a dependence12903// on the TOC base pointer value. It can't however, because there is no12904// way to mark the dependence as implicit there, and so the stackmap code12905// will confuse it with a regular operand. 
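      // (X2 holds the TOC base pointer under the 64-bit ELF ABIs, which is why
      // it is the register added as an implicit use below.)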
Instead, add the dependence12906// here.12907MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));12908}1290912910return emitPatchPoint(MI, BB);12911}1291212913if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||12914MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {12915return emitEHSjLjSetJmp(MI, BB);12916} else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||12917MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {12918return emitEHSjLjLongJmp(MI, BB);12919}1292012921const TargetInstrInfo *TII = Subtarget.getInstrInfo();1292212923// To "insert" these instructions we actually have to insert their12924// control-flow patterns.12925const BasicBlock *LLVM_BB = BB->getBasicBlock();12926MachineFunction::iterator It = ++BB->getIterator();1292712928MachineFunction *F = BB->getParent();12929MachineRegisterInfo &MRI = F->getRegInfo();1293012931if (Subtarget.hasISEL() &&12932(MI.getOpcode() == PPC::SELECT_CC_I4 ||12933MI.getOpcode() == PPC::SELECT_CC_I8 ||12934MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {12935SmallVector<MachineOperand, 2> Cond;12936if (MI.getOpcode() == PPC::SELECT_CC_I4 ||12937MI.getOpcode() == PPC::SELECT_CC_I8)12938Cond.push_back(MI.getOperand(4));12939else12940Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));12941Cond.push_back(MI.getOperand(1));1294212943DebugLoc dl = MI.getDebugLoc();12944TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,12945MI.getOperand(2).getReg(), MI.getOperand(3).getReg());12946} else if (IsSelectCC(MI) || IsSelect(MI)) {12947// The incoming instruction knows the destination vreg to set, the12948// condition code register to branch on, the true/false values to12949// select between, and a branch opcode to use.1295012951// thisMBB:12952// ...12953// TrueVal = ...12954// cmpTY ccX, r1, r212955// bCC sinkMBB12956// fallthrough --> copy0MBB12957MachineBasicBlock *thisMBB = BB;12958MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);12959MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);12960DebugLoc dl = MI.getDebugLoc();12961F->insert(It, copy0MBB);12962F->insert(It, sinkMBB);1296312964// Set the call frame size on entry to the new basic blocks.12965// See https://reviews.llvm.org/D156113.12966unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);12967copy0MBB->setCallFrameSize(CallFrameSize);12968sinkMBB->setCallFrameSize(CallFrameSize);1296912970// Transfer the remainder of BB and its successor edges to sinkMBB.12971sinkMBB->splice(sinkMBB->begin(), BB,12972std::next(MachineBasicBlock::iterator(MI)), BB->end());12973sinkMBB->transferSuccessorsAndUpdatePHIs(BB);1297412975// Next, add the true and fallthrough blocks as its successors.12976BB->addSuccessor(copy0MBB);12977BB->addSuccessor(sinkMBB);1297812979if (IsSelect(MI)) {12980BuildMI(BB, dl, TII->get(PPC::BC))12981.addReg(MI.getOperand(1).getReg())12982.addMBB(sinkMBB);12983} else {12984unsigned SelectPred = MI.getOperand(4).getImm();12985BuildMI(BB, dl, TII->get(PPC::BCC))12986.addImm(SelectPred)12987.addReg(MI.getOperand(1).getReg())12988.addMBB(sinkMBB);12989}1299012991// copy0MBB:12992// %FalseValue = ...12993// # fallthrough to sinkMBB12994BB = copy0MBB;1299512996// Update machine-CFG edges12997BB->addSuccessor(sinkMBB);1299812999// sinkMBB:13000// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]13001// ...13002BB = sinkMBB;13003BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), 
MI.getOperand(0).getReg())13004.addReg(MI.getOperand(3).getReg())13005.addMBB(copy0MBB)13006.addReg(MI.getOperand(2).getReg())13007.addMBB(thisMBB);13008} else if (MI.getOpcode() == PPC::ReadTB) {13009// To read the 64-bit time-base register on a 32-bit target, we read the13010// two halves. Should the counter have wrapped while it was being read, we13011// need to try again.13012// ...13013// readLoop:13014// mfspr Rx,TBU # load from TBU13015// mfspr Ry,TB # load from TB13016// mfspr Rz,TBU # load from TBU13017// cmpw crX,Rx,Rz # check if 'old'='new'13018// bne readLoop # branch if they're not equal13019// ...1302013021MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);13022MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);13023DebugLoc dl = MI.getDebugLoc();13024F->insert(It, readMBB);13025F->insert(It, sinkMBB);1302613027// Transfer the remainder of BB and its successor edges to sinkMBB.13028sinkMBB->splice(sinkMBB->begin(), BB,13029std::next(MachineBasicBlock::iterator(MI)), BB->end());13030sinkMBB->transferSuccessorsAndUpdatePHIs(BB);1303113032BB->addSuccessor(readMBB);13033BB = readMBB;1303413035MachineRegisterInfo &RegInfo = F->getRegInfo();13036Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);13037Register LoReg = MI.getOperand(0).getReg();13038Register HiReg = MI.getOperand(1).getReg();1303913040BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);13041BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);13042BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);1304313044Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);1304513046BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)13047.addReg(HiReg)13048.addReg(ReadAgainReg);13049BuildMI(BB, dl, TII->get(PPC::BCC))13050.addImm(PPC::PRED_NE)13051.addReg(CmpReg)13052.addMBB(readMBB);1305313054BB->addSuccessor(readMBB);13055BB->addSuccessor(sinkMBB);13056} else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)13057BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);13058else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)13059BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);13060else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)13061BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);13062else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)13063BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);1306413065else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)13066BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);13067else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)13068BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);13069else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)13070BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);13071else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)13072BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);1307313074else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)13075BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);13076else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)13077BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);13078else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)13079BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);13080else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)13081BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);1308213083else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)13084BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);13085else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)13086BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);13087else if (MI.getOpcode() == 
PPC::ATOMIC_LOAD_XOR_I32)13088BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);13089else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)13090BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);1309113092else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)13093BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);13094else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)13095BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);13096else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)13097BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);13098else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)13099BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);1310013101else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)13102BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);13103else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)13104BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);13105else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)13106BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);13107else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)13108BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);1310913110else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)13111BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LT);13112else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)13113BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LT);13114else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)13115BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LT);13116else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)13117BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LT);1311813119else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)13120BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GT);13121else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)13122BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GT);13123else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)13124BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GT);13125else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)13126BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GT);1312713128else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)13129BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LT);13130else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)13131BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LT);13132else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)13133BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LT);13134else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)13135BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LT);1313613137else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)13138BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GT);13139else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)13140BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GT);13141else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)13142BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GT);13143else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)13144BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GT);1314513146else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)13147BB = EmitPartwordAtomicBinary(MI, BB, true, 0);13148else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)13149BB = EmitPartwordAtomicBinary(MI, BB, false, 0);13150else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)13151BB = EmitAtomicBinary(MI, BB, 4, 0);13152else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)13153BB = 
EmitAtomicBinary(MI, BB, 8, 0);13154else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||13155MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||13156(Subtarget.hasPartwordAtomics() &&13157MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||13158(Subtarget.hasPartwordAtomics() &&13159MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {13160bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;1316113162auto LoadMnemonic = PPC::LDARX;13163auto StoreMnemonic = PPC::STDCX;13164switch (MI.getOpcode()) {13165default:13166llvm_unreachable("Compare and swap of unknown size");13167case PPC::ATOMIC_CMP_SWAP_I8:13168LoadMnemonic = PPC::LBARX;13169StoreMnemonic = PPC::STBCX;13170assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");13171break;13172case PPC::ATOMIC_CMP_SWAP_I16:13173LoadMnemonic = PPC::LHARX;13174StoreMnemonic = PPC::STHCX;13175assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");13176break;13177case PPC::ATOMIC_CMP_SWAP_I32:13178LoadMnemonic = PPC::LWARX;13179StoreMnemonic = PPC::STWCX;13180break;13181case PPC::ATOMIC_CMP_SWAP_I64:13182LoadMnemonic = PPC::LDARX;13183StoreMnemonic = PPC::STDCX;13184break;13185}13186MachineRegisterInfo &RegInfo = F->getRegInfo();13187Register dest = MI.getOperand(0).getReg();13188Register ptrA = MI.getOperand(1).getReg();13189Register ptrB = MI.getOperand(2).getReg();13190Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);13191Register oldval = MI.getOperand(3).getReg();13192Register newval = MI.getOperand(4).getReg();13193DebugLoc dl = MI.getDebugLoc();1319413195MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);13196MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);13197MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);13198F->insert(It, loop1MBB);13199F->insert(It, loop2MBB);13200F->insert(It, exitMBB);13201exitMBB->splice(exitMBB->begin(), BB,13202std::next(MachineBasicBlock::iterator(MI)), BB->end());13203exitMBB->transferSuccessorsAndUpdatePHIs(BB);1320413205// thisMBB:13206// ...13207// fallthrough --> loopMBB13208BB->addSuccessor(loop1MBB);1320913210// loop1MBB:13211// l[bhwd]arx dest, ptr13212// cmp[wd] dest, oldval13213// bne- exitBB13214// loop2MBB:13215// st[bhwd]cx. newval, ptr13216// bne- loopMBB13217// b exitBB13218// exitBB:13219BB = loop1MBB;13220BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);13221BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), CrReg)13222.addReg(dest)13223.addReg(oldval);13224BuildMI(BB, dl, TII->get(PPC::BCC))13225.addImm(PPC::PRED_NE)13226.addReg(CrReg)13227.addMBB(exitMBB);13228BB->addSuccessor(loop2MBB);13229BB->addSuccessor(exitMBB);1323013231BB = loop2MBB;13232BuildMI(BB, dl, TII->get(StoreMnemonic))13233.addReg(newval)13234.addReg(ptrA)13235.addReg(ptrB);13236BuildMI(BB, dl, TII->get(PPC::BCC))13237.addImm(PPC::PRED_NE)13238.addReg(PPC::CR0)13239.addMBB(loop1MBB);13240BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);13241BB->addSuccessor(loop1MBB);13242BB->addSuccessor(exitMBB);1324313244// exitMBB:13245// ...13246BB = exitMBB;13247} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||13248MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {13249// We must use 64-bit registers for addresses when targeting 64-bit,13250// since we're actually doing arithmetic on them. 
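    // As in EmitPartwordAtomicBinary above, the expected and new values are
    // shifted and masked into position within the naturally aligned word, so
    // only the pointer computation needs the full pointer width.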
Other registers13251// can be 32-bit.13252bool is64bit = Subtarget.isPPC64();13253bool isLittleEndian = Subtarget.isLittleEndian();13254bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;1325513256Register dest = MI.getOperand(0).getReg();13257Register ptrA = MI.getOperand(1).getReg();13258Register ptrB = MI.getOperand(2).getReg();13259Register oldval = MI.getOperand(3).getReg();13260Register newval = MI.getOperand(4).getReg();13261DebugLoc dl = MI.getDebugLoc();1326213263MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);13264MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);13265MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);13266F->insert(It, loop1MBB);13267F->insert(It, loop2MBB);13268F->insert(It, exitMBB);13269exitMBB->splice(exitMBB->begin(), BB,13270std::next(MachineBasicBlock::iterator(MI)), BB->end());13271exitMBB->transferSuccessorsAndUpdatePHIs(BB);1327213273MachineRegisterInfo &RegInfo = F->getRegInfo();13274const TargetRegisterClass *RC =13275is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;13276const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;1327713278Register PtrReg = RegInfo.createVirtualRegister(RC);13279Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);13280Register ShiftReg =13281isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);13282Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);13283Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);13284Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);13285Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);13286Register MaskReg = RegInfo.createVirtualRegister(GPRC);13287Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);13288Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);13289Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);13290Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);13291Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);13292Register Ptr1Reg;13293Register TmpReg = RegInfo.createVirtualRegister(GPRC);13294Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;13295Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);13296// thisMBB:13297// ...13298// fallthrough --> loopMBB13299BB->addSuccessor(loop1MBB);1330013301// The 4-byte load must be aligned, while a char or short may be13302// anywhere in the word. Hence all this nasty bookkeeping code.13303// add ptr1, ptrA, ptrB [copy if ptrA==0]13304// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]13305// xori shift, shift1, 24 [16]13306// rlwinm ptr, ptr1, 0, 0, 2913307// slw newval2, newval, shift13308// slw oldval2, oldval,shift13309// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]13310// slw mask, mask2, shift13311// and newval3, newval2, mask13312// and oldval3, oldval2, mask13313// loop1MBB:13314// lwarx tmpDest, ptr13315// and tmp, tmpDest, mask13316// cmpw tmp, oldval313317// bne- exitBB13318// loop2MBB:13319// andc tmp2, tmpDest, mask13320// or tmp4, tmp2, newval313321// stwcx. tmp4, ptr13322// bne- loop1MBB13323// b exitBB13324// exitBB:13325// srw dest, tmpDest, shift13326if (ptrA != ZeroReg) {13327Ptr1Reg = RegInfo.createVirtualRegister(RC);13328BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)13329.addReg(ptrA)13330.addReg(ptrB);13331} else {13332Ptr1Reg = ptrB;13333}1333413335// We need use 32-bit subregister to avoid mismatch register class in 64-bit13336// mode.13337BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)13338.addReg(Ptr1Reg, 0, is64bit ? 
PPC::sub_32 : 0)13339.addImm(3)13340.addImm(27)13341.addImm(is8bit ? 28 : 27);13342if (!isLittleEndian)13343BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)13344.addReg(Shift1Reg)13345.addImm(is8bit ? 24 : 16);13346if (is64bit)13347BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)13348.addReg(Ptr1Reg)13349.addImm(0)13350.addImm(61);13351else13352BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)13353.addReg(Ptr1Reg)13354.addImm(0)13355.addImm(0)13356.addImm(29);13357BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)13358.addReg(newval)13359.addReg(ShiftReg);13360BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)13361.addReg(oldval)13362.addReg(ShiftReg);13363if (is8bit)13364BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);13365else {13366BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);13367BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)13368.addReg(Mask3Reg)13369.addImm(65535);13370}13371BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)13372.addReg(Mask2Reg)13373.addReg(ShiftReg);13374BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)13375.addReg(NewVal2Reg)13376.addReg(MaskReg);13377BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)13378.addReg(OldVal2Reg)13379.addReg(MaskReg);1338013381BB = loop1MBB;13382BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)13383.addReg(ZeroReg)13384.addReg(PtrReg);13385BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)13386.addReg(TmpDestReg)13387.addReg(MaskReg);13388BuildMI(BB, dl, TII->get(PPC::CMPW), CrReg)13389.addReg(TmpReg)13390.addReg(OldVal3Reg);13391BuildMI(BB, dl, TII->get(PPC::BCC))13392.addImm(PPC::PRED_NE)13393.addReg(CrReg)13394.addMBB(exitMBB);13395BB->addSuccessor(loop2MBB);13396BB->addSuccessor(exitMBB);1339713398BB = loop2MBB;13399BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)13400.addReg(TmpDestReg)13401.addReg(MaskReg);13402BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)13403.addReg(Tmp2Reg)13404.addReg(NewVal3Reg);13405BuildMI(BB, dl, TII->get(PPC::STWCX))13406.addReg(Tmp4Reg)13407.addReg(ZeroReg)13408.addReg(PtrReg);13409BuildMI(BB, dl, TII->get(PPC::BCC))13410.addImm(PPC::PRED_NE)13411.addReg(PPC::CR0)13412.addMBB(loop1MBB);13413BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);13414BB->addSuccessor(loop1MBB);13415BB->addSuccessor(exitMBB);1341613417// exitMBB:13418// ...13419BB = exitMBB;13420BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)13421.addReg(TmpReg)13422.addReg(ShiftReg);13423} else if (MI.getOpcode() == PPC::FADDrtz) {13424// This pseudo performs an FADD with rounding mode temporarily forced13425// to round-to-zero. 
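    // This is used, for instance, when lowering FP_TO_INT for ppcf128, where
    // the two halves of the long double are first added with the rounding
    // mode forced to zero so that the truncating conversion that follows sees
    // a correctly truncated sum.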
    // We emit this via custom inserter since the FPSCR is not modeled at the
    // SelectionDAG level.
    Register Dest = MI.getOperand(0).getReg();
    Register Src1 = MI.getOperand(1).getReg();
    Register Src2 = MI.getOperand(2).getReg();
    DebugLoc dl = MI.getDebugLoc();

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);

    // Set rounding mode to round-to-zero.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1))
        .addImm(31)
        .addReg(PPC::RM, RegState::ImplicitDefine);

    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0))
        .addImm(30)
        .addReg(PPC::RM, RegState::ImplicitDefine);

    // Perform the addition.
    auto MIB = BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest)
                   .addReg(Src1)
                   .addReg(Src2);
    if (MI.getFlag(MachineInstr::NoFPExcept))
      MIB.setMIFlag(MachineInstr::NoFPExcept);

    // Restore FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
  } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
             MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
             MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
             MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
    unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
                       MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
                          ? PPC::ANDI8_rec
                          : PPC::ANDI_rec;
    bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
                 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register Dest = RegInfo.createVirtualRegister(
        Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);

    DebugLoc Dl = MI.getDebugLoc();
    BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
        .addReg(MI.getOperand(1).getReg())
        .addImm(1);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
  } else if (MI.getOpcode() == PPC::TCHECK_RET) {
    DebugLoc Dl = MI.getDebugLoc();
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
    BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(CRReg);
  } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
    DebugLoc Dl = MI.getDebugLoc();
    unsigned Imm = MI.getOperand(1).getImm();
    BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(PPC::CR0EQ);
  } else if (MI.getOpcode() == PPC::SETRNDi) {
    DebugLoc dl = MI.getDebugLoc();
    Register OldFPSCRReg = MI.getOperand(0).getReg();

    // Save FPSCR value.
    if (MRI.use_empty(OldFPSCRReg))
      BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg);
    else
      BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);

    // The floating-point rounding mode is in bits 62:63 of the FPSCR and has
    // the following settings:
    //   00 Round to nearest
    //   01 Round to 0
    //   10 Round to +inf
    //   11 Round to -inf

    // When the operand is an immediate, use its two least significant bits to
    // set bits 62:63 of the FPSCR.
    unsigned Mode = MI.getOperand(1).getImm();
    BuildMI(*BB, MI, dl, TII->get((Mode & 1) ?
PPC::MTFSB1 : PPC::MTFSB0))13514.addImm(31)13515.addReg(PPC::RM, RegState::ImplicitDefine);1351613517BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))13518.addImm(30)13519.addReg(PPC::RM, RegState::ImplicitDefine);13520} else if (MI.getOpcode() == PPC::SETRND) {13521DebugLoc dl = MI.getDebugLoc();1352213523// Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg13524// or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.13525// If the target doesn't have DirectMove, we should use stack to do the13526// conversion, because the target doesn't have the instructions like mtvsrd13527// or mfvsrd to do this conversion directly.13528auto copyRegFromG8RCOrF8RC = [&] (unsigned DestReg, unsigned SrcReg) {13529if (Subtarget.hasDirectMove()) {13530BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)13531.addReg(SrcReg);13532} else {13533// Use stack to do the register copy.13534unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;13535MachineRegisterInfo &RegInfo = F->getRegInfo();13536const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);13537if (RC == &PPC::F8RCRegClass) {13538// Copy register from F8RCRegClass to G8RCRegclass.13539assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&13540"Unsupported RegClass.");1354113542StoreOp = PPC::STFD;13543LoadOp = PPC::LD;13544} else {13545// Copy register from G8RCRegClass to F8RCRegclass.13546assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&13547(RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&13548"Unsupported RegClass.");13549}1355013551MachineFrameInfo &MFI = F->getFrameInfo();13552int FrameIdx = MFI.CreateStackObject(8, Align(8), false);1355313554MachineMemOperand *MMOStore = F->getMachineMemOperand(13555MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),13556MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),13557MFI.getObjectAlign(FrameIdx));1355813559// Store the SrcReg into the stack.13560BuildMI(*BB, MI, dl, TII->get(StoreOp))13561.addReg(SrcReg)13562.addImm(0)13563.addFrameIndex(FrameIdx)13564.addMemOperand(MMOStore);1356513566MachineMemOperand *MMOLoad = F->getMachineMemOperand(13567MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),13568MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),13569MFI.getObjectAlign(FrameIdx));1357013571// Load from the stack where SrcReg is stored, and save to DestReg,13572// so we have done the RegClass conversion from RegClass::SrcReg to13573// RegClass::DestReg.13574BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)13575.addImm(0)13576.addFrameIndex(FrameIdx)13577.addMemOperand(MMOLoad);13578}13579};1358013581Register OldFPSCRReg = MI.getOperand(0).getReg();1358213583// Save FPSCR value.13584BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);1358513586// When the operand is gprc register, use two least significant bits of the13587// register and mtfsf instruction to set the bits 62:63 of FPSCR.13588//13589// copy OldFPSCRTmpReg, OldFPSCRReg13590// (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)13591// rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 6213592// copy NewFPSCRReg, NewFPSCRTmpReg13593// mtfsf 255, NewFPSCRReg13594MachineOperand SrcOp = MI.getOperand(1);13595MachineRegisterInfo &RegInfo = F->getRegInfo();13596Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);1359713598copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);1359913600Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);13601Register ExtSrcReg = 
RegInfo.createVirtualRegister(&PPC::G8RCRegClass);1360213603// The first operand of INSERT_SUBREG should be a register which has13604// subregisters, we only care about its RegClass, so we should use an13605// IMPLICIT_DEF register.13606BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);13607BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)13608.addReg(ImDefReg)13609.add(SrcOp)13610.addImm(1);1361113612Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);13613BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)13614.addReg(OldFPSCRTmpReg)13615.addReg(ExtSrcReg)13616.addImm(0)13617.addImm(62);1361813619Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);13620copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);1362113622// The mask 255 means that put the 32:63 bits of NewFPSCRReg to the 32:6313623// bits of FPSCR.13624BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))13625.addImm(255)13626.addReg(NewFPSCRReg)13627.addImm(0)13628.addImm(0);13629} else if (MI.getOpcode() == PPC::SETFLM) {13630DebugLoc Dl = MI.getDebugLoc();1363113632// Result of setflm is previous FPSCR content, so we need to save it first.13633Register OldFPSCRReg = MI.getOperand(0).getReg();13634if (MRI.use_empty(OldFPSCRReg))13635BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg);13636else13637BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg);1363813639// Put bits in 32:63 to FPSCR.13640Register NewFPSCRReg = MI.getOperand(1).getReg();13641BuildMI(*BB, MI, Dl, TII->get(PPC::MTFSF))13642.addImm(255)13643.addReg(NewFPSCRReg)13644.addImm(0)13645.addImm(0);13646} else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||13647MI.getOpcode() == PPC::PROBED_ALLOCA_64) {13648return emitProbedAlloca(MI, BB);13649} else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {13650DebugLoc DL = MI.getDebugLoc();13651Register Src = MI.getOperand(2).getReg();13652Register Lo = MI.getOperand(0).getReg();13653Register Hi = MI.getOperand(1).getReg();13654BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))13655.addDef(Lo)13656.addUse(Src, 0, PPC::sub_gp8_x1);13657BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))13658.addDef(Hi)13659.addUse(Src, 0, PPC::sub_gp8_x0);13660} else if (MI.getOpcode() == PPC::LQX_PSEUDO ||13661MI.getOpcode() == PPC::STQX_PSEUDO) {13662DebugLoc DL = MI.getDebugLoc();13663// Ptr is used as the ptr_rc_no_r0 part13664// of LQ/STQ's memory operand and adding result of RA and RB,13665// so it has to be g8rc_and_g8rc_nox0.13666Register Ptr =13667F->getRegInfo().createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);13668Register Val = MI.getOperand(0).getReg();13669Register RA = MI.getOperand(1).getReg();13670Register RB = MI.getOperand(2).getReg();13671BuildMI(*BB, MI, DL, TII->get(PPC::ADD8), Ptr).addReg(RA).addReg(RB);13672BuildMI(*BB, MI, DL,13673MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(PPC::LQ)13674: TII->get(PPC::STQ))13675.addReg(Val, MI.getOpcode() == PPC::LQX_PSEUDO ? 
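// The RLDIMI used by SETRND above rotates by 0 and inserts under a mask
// that starts at bit 62, so it amounts to splicing the two low bits of the
// requested mode into the saved FPSCR image before mtfsf 255 writes it
// back. Scalar model (assumed helper, not part of the lowering):
static inline uint64_t spliceRoundingBits(uint64_t OldFPSCR, uint64_t NewRN) {
  return (OldFPSCR & ~uint64_t(3)) | (NewRN & uint64_t(3));
}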
RegState::Define : 0)13676.addImm(0)13677.addReg(Ptr);13678} else {13679llvm_unreachable("Unexpected instr type to insert");13680}1368113682MI.eraseFromParent(); // The pseudo instruction is gone now.13683return BB;13684}1368513686//===----------------------------------------------------------------------===//13687// Target Optimization Hooks13688//===----------------------------------------------------------------------===//1368913690static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {13691// For the estimates, convergence is quadratic, so we essentially double the13692// number of digits correct after every iteration. For both FRE and FRSQRTE,13693// the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),13694// this is 2^-14. IEEE float has 23 digits and double has 52 digits.13695int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;13696if (VT.getScalarType() == MVT::f64)13697RefinementSteps++;13698return RefinementSteps;13699}1370013701SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,13702const DenormalMode &Mode) const {13703// We only have VSX Vector Test for software Square Root.13704EVT VT = Op.getValueType();13705if (!isTypeLegal(MVT::i1) ||13706(VT != MVT::f64 &&13707((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())))13708return TargetLowering::getSqrtInputTest(Op, DAG, Mode);1370913710SDLoc DL(Op);13711// The output register of FTSQRT is CR field.13712SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op);13713// ftsqrt BF,FRB13714// Let e_b be the unbiased exponent of the double-precision13715// floating-point operand in register FRB.13716// fe_flag is set to 1 if either of the following conditions occurs.13717// - The double-precision floating-point operand in register FRB is a zero,13718// a NaN, or an infinity, or a negative value.13719// - e_b is less than or equal to -970.13720// Otherwise fe_flag is set to 0.13721// Both VSX and non-VSX versions would set EQ bit in the CR if the number is13722// not eligible for iteration. 
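// Scalar model (illustration only, assumes <cmath>) of the condition the
// ftsqrt test reports: the estimate-and-refine path must be skipped when
// the input is zero, negative, NaN, infinite, or its unbiased exponent
// e_b is no greater than -970.
static inline bool sqrtNeedsFullPrecisionFallback(double X) {
  if (X == 0.0 || std::isnan(X) || std::isinf(X) || std::signbit(X))
    return true;
  return std::ilogb(X) <= -970;   // e_b for finite, non-zero inputs
}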
(zero/negative/infinity/nan or unbiased13723// exponent is less than -970)13724SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32);13725return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1,13726FTSQRT, SRIdxVal),137270);13728}1372913730SDValue13731PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,13732SelectionDAG &DAG) const {13733// We only have VSX Vector Square Root.13734EVT VT = Op.getValueType();13735if (VT != MVT::f64 &&13736((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))13737return TargetLowering::getSqrtResultForDenormInput(Op, DAG);1373813739return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op);13740}1374113742SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,13743int Enabled, int &RefinementSteps,13744bool &UseOneConstNR,13745bool Reciprocal) const {13746EVT VT = Operand.getValueType();13747if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||13748(VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||13749(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||13750(VT == MVT::v2f64 && Subtarget.hasVSX())) {13751if (RefinementSteps == ReciprocalEstimate::Unspecified)13752RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);1375313754// The Newton-Raphson computation with a single constant does not provide13755// enough accuracy on some CPUs.13756UseOneConstNR = !Subtarget.needsTwoConstNR();13757return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);13758}13759return SDValue();13760}1376113762SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,13763int Enabled,13764int &RefinementSteps) const {13765EVT VT = Operand.getValueType();13766if ((VT == MVT::f32 && Subtarget.hasFRES()) ||13767(VT == MVT::f64 && Subtarget.hasFRE()) ||13768(VT == MVT::v4f32 && Subtarget.hasAltivec()) ||13769(VT == MVT::v2f64 && Subtarget.hasVSX())) {13770if (RefinementSteps == ReciprocalEstimate::Unspecified)13771RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);13772return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);13773}13774return SDValue();13775}1377613777unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {13778// Note: This functionality is used only when unsafe-fp-math is enabled, and13779// on cores with reciprocal estimates (which are used when unsafe-fp-math is13780// enabled for division), this functionality is redundant with the default13781// combiner logic (once the division -> reciprocal/multiply transformation13782// has taken place). 
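// The step counts returned by getEstimateRefinementSteps above follow from
// the doubling argument in its comment: each Newton-Raphson iteration
// doubles the number of correct bits. A self-contained check (illustrative
// only, not used by the lowering):
static inline int stepsToReach(int InitialBits, int NeededBits) {
  int Steps = 0;
  for (int Bits = InitialBits; Bits < NeededBits; Bits *= 2)
    ++Steps;
  return Steps;
}
// stepsToReach(5, 23) == 3, stepsToReach(5, 52) == 4   (2^-5 base accuracy)
// stepsToReach(14, 23) == 1, stepsToReach(14, 52) == 2 (with hasRecipPrec)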
As a result, this matters more for older cores than for13783// newer ones.1378413785// Combine multiple FDIVs with the same divisor into multiple FMULs by the13786// reciprocal if there are two or more FDIVs (for embedded cores with only13787// one FP pipeline) for three or more FDIVs (for generic OOO cores).13788switch (Subtarget.getCPUDirective()) {13789default:13790return 3;13791case PPC::DIR_440:13792case PPC::DIR_A2:13793case PPC::DIR_E500:13794case PPC::DIR_E500mc:13795case PPC::DIR_E5500:13796return 2;13797}13798}1379913800// isConsecutiveLSLoc needs to work even if all adds have not yet been13801// collapsed, and so we need to look through chains of them.13802static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,13803int64_t& Offset, SelectionDAG &DAG) {13804if (DAG.isBaseWithConstantOffset(Loc)) {13805Base = Loc.getOperand(0);13806Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();1380713808// The base might itself be a base plus an offset, and if so, accumulate13809// that as well.13810getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);13811}13812}1381313814static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,13815unsigned Bytes, int Dist,13816SelectionDAG &DAG) {13817if (VT.getSizeInBits() / 8 != Bytes)13818return false;1381913820SDValue BaseLoc = Base->getBasePtr();13821if (Loc.getOpcode() == ISD::FrameIndex) {13822if (BaseLoc.getOpcode() != ISD::FrameIndex)13823return false;13824const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();13825int FI = cast<FrameIndexSDNode>(Loc)->getIndex();13826int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();13827int FS = MFI.getObjectSize(FI);13828int BFS = MFI.getObjectSize(BFI);13829if (FS != BFS || FS != (int)Bytes) return false;13830return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);13831}1383213833SDValue Base1 = Loc, Base2 = BaseLoc;13834int64_t Offset1 = 0, Offset2 = 0;13835getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);13836getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);13837if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))13838return true;1383913840const TargetLowering &TLI = DAG.getTargetLoweringInfo();13841const GlobalValue *GV1 = nullptr;13842const GlobalValue *GV2 = nullptr;13843Offset1 = 0;13844Offset2 = 0;13845bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);13846bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);13847if (isGA1 && isGA2 && GV1 == GV2)13848return Offset1 == (Offset2 + Dist*Bytes);13849return false;13850}1385113852// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does13853// not enforce equality of the chain operands.13854static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,13855unsigned Bytes, int Dist,13856SelectionDAG &DAG) {13857if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {13858EVT VT = LS->getMemoryVT();13859SDValue Loc = LS->getBasePtr();13860return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);13861}1386213863if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {13864EVT VT;13865switch (N->getConstantOperandVal(1)) {13866default: return false;13867case Intrinsic::ppc_altivec_lvx:13868case Intrinsic::ppc_altivec_lvxl:13869case Intrinsic::ppc_vsx_lxvw4x:13870case Intrinsic::ppc_vsx_lxvw4x_be:13871VT = MVT::v4i32;13872break;13873case Intrinsic::ppc_vsx_lxvd2x:13874case Intrinsic::ppc_vsx_lxvd2x_be:13875VT = MVT::v2f64;13876break;13877case Intrinsic::ppc_altivec_lvebx:13878VT = MVT::i8;13879break;13880case 
Intrinsic::ppc_altivec_lvehx:13881VT = MVT::i16;13882break;13883case Intrinsic::ppc_altivec_lvewx:13884VT = MVT::i32;13885break;13886}1388713888return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);13889}1389013891if (N->getOpcode() == ISD::INTRINSIC_VOID) {13892EVT VT;13893switch (N->getConstantOperandVal(1)) {13894default: return false;13895case Intrinsic::ppc_altivec_stvx:13896case Intrinsic::ppc_altivec_stvxl:13897case Intrinsic::ppc_vsx_stxvw4x:13898VT = MVT::v4i32;13899break;13900case Intrinsic::ppc_vsx_stxvd2x:13901VT = MVT::v2f64;13902break;13903case Intrinsic::ppc_vsx_stxvw4x_be:13904VT = MVT::v4i32;13905break;13906case Intrinsic::ppc_vsx_stxvd2x_be:13907VT = MVT::v2f64;13908break;13909case Intrinsic::ppc_altivec_stvebx:13910VT = MVT::i8;13911break;13912case Intrinsic::ppc_altivec_stvehx:13913VT = MVT::i16;13914break;13915case Intrinsic::ppc_altivec_stvewx:13916VT = MVT::i32;13917break;13918}1391913920return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);13921}1392213923return false;13924}1392513926// Return true is there is a nearyby consecutive load to the one provided13927// (regardless of alignment). We search up and down the chain, looking though13928// token factors and other loads (but nothing else). As a result, a true result13929// indicates that it is safe to create a new consecutive load adjacent to the13930// load provided.13931static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {13932SDValue Chain = LD->getChain();13933EVT VT = LD->getMemoryVT();1393413935SmallSet<SDNode *, 16> LoadRoots;13936SmallVector<SDNode *, 8> Queue(1, Chain.getNode());13937SmallSet<SDNode *, 16> Visited;1393813939// First, search up the chain, branching to follow all token-factor operands.13940// If we find a consecutive load, then we're done, otherwise, record all13941// nodes just above the top-level loads and token factors.13942while (!Queue.empty()) {13943SDNode *ChainNext = Queue.pop_back_val();13944if (!Visited.insert(ChainNext).second)13945continue;1394613947if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {13948if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))13949return true;1395013951if (!Visited.count(ChainLD->getChain().getNode()))13952Queue.push_back(ChainLD->getChain().getNode());13953} else if (ChainNext->getOpcode() == ISD::TokenFactor) {13954for (const SDUse &O : ChainNext->ops())13955if (!Visited.count(O.getNode()))13956Queue.push_back(O.getNode());13957} else13958LoadRoots.insert(ChainNext);13959}1396013961// Second, search down the chain, starting from the top-level nodes recorded13962// in the first phase. These top-level nodes are the nodes just above all13963// loads and token factors. 
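// Both phases of the findConsecutiveLoad search use the same visited-set
// worklist idiom; a generic sketch of that shape (hypothetical helper, not
// in the tree):
template <typename NodeT, typename TestFn, typename SuccFn>
static bool worklistSearch(NodeT *Start, TestFn Test, SuccFn Successors) {
  SmallVector<NodeT *, 8> Queue(1, Start);
  SmallPtrSet<NodeT *, 16> Visited;
  while (!Queue.empty()) {
    NodeT *Cur = Queue.pop_back_val();
    if (!Visited.insert(Cur).second)
      continue;                          // already examined
    if (Test(Cur))
      return true;                       // e.g. found a consecutive access
    for (NodeT *Next : Successors(Cur))
      if (!Visited.count(Next))
        Queue.push_back(Next);
  }
  return false;
}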
Starting with their uses, recursively look though13964// all loads (just the chain uses) and token factors to find a consecutive13965// load.13966Visited.clear();13967Queue.clear();1396813969for (SDNode *I : LoadRoots) {13970Queue.push_back(I);1397113972while (!Queue.empty()) {13973SDNode *LoadRoot = Queue.pop_back_val();13974if (!Visited.insert(LoadRoot).second)13975continue;1397613977if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))13978if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))13979return true;1398013981for (SDNode *U : LoadRoot->uses())13982if (((isa<MemSDNode>(U) &&13983cast<MemSDNode>(U)->getChain().getNode() == LoadRoot) ||13984U->getOpcode() == ISD::TokenFactor) &&13985!Visited.count(U))13986Queue.push_back(U);13987}13988}1398913990return false;13991}1399213993/// This function is called when we have proved that a SETCC node can be replaced13994/// by subtraction (and other supporting instructions) so that the result of13995/// comparison is kept in a GPR instead of CR. This function is purely for13996/// codegen purposes and has some flags to guide the codegen process.13997static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,13998bool Swap, SDLoc &DL, SelectionDAG &DAG) {13999assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");1400014001// Zero extend the operands to the largest legal integer. Originally, they14002// must be of a strictly smaller size.14003auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),14004DAG.getConstant(Size, DL, MVT::i32));14005auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),14006DAG.getConstant(Size, DL, MVT::i32));1400714008// Swap if needed. Depends on the condition code.14009if (Swap)14010std::swap(Op0, Op1);1401114012// Subtract extended integers.14013auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);1401414015// Move the sign bit to the least significant position and zero out the rest.14016// Now the least significant bit carries the result of original comparison.14017auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,14018DAG.getConstant(Size - 1, DL, MVT::i32));14019auto Final = Shifted;1402014021// Complement the result if needed. 
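// Scalar model (illustration only) of why the subtract/shift sequence is
// correct for the unsigned predicates ConvertSETCCToSubtract handles: the
// difference of the zero-extended values fits in a signed 64-bit integer,
// so bit 63 of the two's complement result is set exactly when the first
// operand is (unsigned) smaller.
static inline bool ultViaSub(uint32_t A, uint32_t B) {
  uint64_t Diff = uint64_t(A) - uint64_t(B);
  return (Diff >> 63) & 1;               // 1 iff A <u B
}
// SETULE swaps and complements:  A <=u B  ==  !(B <u A)
// SETUGT swaps only:             A  >u B  ==    B <u A
// SETUGE complements only:       A >=u B  ==  !(A <u B)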
Based on the condition code.14022if (Complement)14023Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,14024DAG.getConstant(1, DL, MVT::i64));1402514026return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);14027}1402814029SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,14030DAGCombinerInfo &DCI) const {14031assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");1403214033SelectionDAG &DAG = DCI.DAG;14034SDLoc DL(N);1403514036// Size of integers being compared has a critical role in the following14037// analysis, so we prefer to do this when all types are legal.14038if (!DCI.isAfterLegalizeDAG())14039return SDValue();1404014041// If all users of SETCC extend its value to a legal integer type14042// then we replace SETCC with a subtraction14043for (const SDNode *U : N->uses())14044if (U->getOpcode() != ISD::ZERO_EXTEND)14045return SDValue();1404614047ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();14048auto OpSize = N->getOperand(0).getValueSizeInBits();1404914050unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();1405114052if (OpSize < Size) {14053switch (CC) {14054default: break;14055case ISD::SETULT:14056return generateEquivalentSub(N, Size, false, false, DL, DAG);14057case ISD::SETULE:14058return generateEquivalentSub(N, Size, true, true, DL, DAG);14059case ISD::SETUGT:14060return generateEquivalentSub(N, Size, false, true, DL, DAG);14061case ISD::SETUGE:14062return generateEquivalentSub(N, Size, true, false, DL, DAG);14063}14064}1406514066return SDValue();14067}1406814069SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,14070DAGCombinerInfo &DCI) const {14071SelectionDAG &DAG = DCI.DAG;14072SDLoc dl(N);1407314074assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");14075// If we're tracking CR bits, we need to be careful that we don't have:14076// trunc(binary-ops(zext(x), zext(y)))14077// or14078// trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)14079// such that we're unnecessarily moving things into GPRs when it would be14080// better to keep them in CR bits.1408114082// Note that trunc here can be an actual i1 trunc, or can be the effective14083// truncation that comes from a setcc or select_cc.14084if (N->getOpcode() == ISD::TRUNCATE &&14085N->getValueType(0) != MVT::i1)14086return SDValue();1408714088if (N->getOperand(0).getValueType() != MVT::i32 &&14089N->getOperand(0).getValueType() != MVT::i64)14090return SDValue();1409114092if (N->getOpcode() == ISD::SETCC ||14093N->getOpcode() == ISD::SELECT_CC) {14094// If we're looking at a comparison, then we need to make sure that the14095// high bits (all except for the first) don't matter the result.14096ISD::CondCode CC =14097cast<CondCodeSDNode>(N->getOperand(14098N->getOpcode() == ISD::SETCC ? 2 : 4))->get();14099unsigned OpBits = N->getOperand(0).getValueSizeInBits();1410014101if (ISD::isSignedIntSetCC(CC)) {14102if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||14103DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)14104return SDValue();14105} else if (ISD::isUnsignedIntSetCC(CC)) {14106if (!DAG.MaskedValueIsZero(N->getOperand(0),14107APInt::getHighBitsSet(OpBits, OpBits-1)) ||14108!DAG.MaskedValueIsZero(N->getOperand(1),14109APInt::getHighBitsSet(OpBits, OpBits-1)))14110return (N->getOpcode() == ISD::SETCC ? 
ConvertSETCCToSubtract(N, DCI)14111: SDValue());14112} else {14113// This is neither a signed nor an unsigned comparison, just make sure14114// that the high bits are equal.14115KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));14116KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));1411714118// We don't really care about what is known about the first bit (if14119// anything), so pretend that it is known zero for both to ensure they can14120// be compared as constants.14121Op1Known.Zero.setBit(0); Op1Known.One.clearBit(0);14122Op2Known.Zero.setBit(0); Op2Known.One.clearBit(0);1412314124if (!Op1Known.isConstant() || !Op2Known.isConstant() ||14125Op1Known.getConstant() != Op2Known.getConstant())14126return SDValue();14127}14128}1412914130// We now know that the higher-order bits are irrelevant, we just need to14131// make sure that all of the intermediate operations are bit operations, and14132// all inputs are extensions.14133if (N->getOperand(0).getOpcode() != ISD::AND &&14134N->getOperand(0).getOpcode() != ISD::OR &&14135N->getOperand(0).getOpcode() != ISD::XOR &&14136N->getOperand(0).getOpcode() != ISD::SELECT &&14137N->getOperand(0).getOpcode() != ISD::SELECT_CC &&14138N->getOperand(0).getOpcode() != ISD::TRUNCATE &&14139N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&14140N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&14141N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)14142return SDValue();1414314144if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&14145N->getOperand(1).getOpcode() != ISD::AND &&14146N->getOperand(1).getOpcode() != ISD::OR &&14147N->getOperand(1).getOpcode() != ISD::XOR &&14148N->getOperand(1).getOpcode() != ISD::SELECT &&14149N->getOperand(1).getOpcode() != ISD::SELECT_CC &&14150N->getOperand(1).getOpcode() != ISD::TRUNCATE &&14151N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&14152N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&14153N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)14154return SDValue();1415514156SmallVector<SDValue, 4> Inputs;14157SmallVector<SDValue, 8> BinOps, PromOps;14158SmallPtrSet<SDNode *, 16> Visited;1415914160for (unsigned i = 0; i < 2; ++i) {14161if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||14162N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||14163N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&14164N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||14165isa<ConstantSDNode>(N->getOperand(i)))14166Inputs.push_back(N->getOperand(i));14167else14168BinOps.push_back(N->getOperand(i));1416914170if (N->getOpcode() == ISD::TRUNCATE)14171break;14172}1417314174// Visit all inputs, collect all binary operations (and, or, xor and14175// select) that are all fed by extensions.14176while (!BinOps.empty()) {14177SDValue BinOp = BinOps.pop_back_val();1417814179if (!Visited.insert(BinOp.getNode()).second)14180continue;1418114182PromOps.push_back(BinOp);1418314184for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {14185// The condition of the select is not promoted.14186if (BinOp.getOpcode() == ISD::SELECT && i == 0)14187continue;14188if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)14189continue;1419014191if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||14192BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||14193BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&14194BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||14195isa<ConstantSDNode>(BinOp.getOperand(i))) {14196Inputs.push_back(BinOp.getOperand(i));14197} else if 
(BinOp.getOperand(i).getOpcode() == ISD::AND ||14198BinOp.getOperand(i).getOpcode() == ISD::OR ||14199BinOp.getOperand(i).getOpcode() == ISD::XOR ||14200BinOp.getOperand(i).getOpcode() == ISD::SELECT ||14201BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||14202BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||14203BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||14204BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||14205BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {14206BinOps.push_back(BinOp.getOperand(i));14207} else {14208// We have an input that is not an extension or another binary14209// operation; we'll abort this transformation.14210return SDValue();14211}14212}14213}1421414215// Make sure that this is a self-contained cluster of operations (which14216// is not quite the same thing as saying that everything has only one14217// use).14218for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {14219if (isa<ConstantSDNode>(Inputs[i]))14220continue;1422114222for (const SDNode *User : Inputs[i].getNode()->uses()) {14223if (User != N && !Visited.count(User))14224return SDValue();1422514226// Make sure that we're not going to promote the non-output-value14227// operand(s) or SELECT or SELECT_CC.14228// FIXME: Although we could sometimes handle this, and it does occur in14229// practice that one of the condition inputs to the select is also one of14230// the outputs, we currently can't deal with this.14231if (User->getOpcode() == ISD::SELECT) {14232if (User->getOperand(0) == Inputs[i])14233return SDValue();14234} else if (User->getOpcode() == ISD::SELECT_CC) {14235if (User->getOperand(0) == Inputs[i] ||14236User->getOperand(1) == Inputs[i])14237return SDValue();14238}14239}14240}1424114242for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {14243for (const SDNode *User : PromOps[i].getNode()->uses()) {14244if (User != N && !Visited.count(User))14245return SDValue();1424614247// Make sure that we're not going to promote the non-output-value14248// operand(s) or SELECT or SELECT_CC.14249// FIXME: Although we could sometimes handle this, and it does occur in14250// practice that one of the condition inputs to the select is also one of14251// the outputs, we currently can't deal with this.14252if (User->getOpcode() == ISD::SELECT) {14253if (User->getOperand(0) == PromOps[i])14254return SDValue();14255} else if (User->getOpcode() == ISD::SELECT_CC) {14256if (User->getOperand(0) == PromOps[i] ||14257User->getOperand(1) == PromOps[i])14258return SDValue();14259}14260}14261}1426214263// Replace all inputs with the extension operand.14264for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {14265// Constants may have users outside the cluster of to-be-promoted nodes,14266// and so we need to replace those as we do the promotions.14267if (isa<ConstantSDNode>(Inputs[i]))14268continue;14269else14270DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));14271}1427214273std::list<HandleSDNode> PromOpHandles;14274for (auto &PromOp : PromOps)14275PromOpHandles.emplace_back(PromOp);1427614277// Replace all operations (these are all the same, but have a different14278// (i1) return type). DAG.getNode will validate that the types of14279// a binary operator match, so go through the list in reverse so that14280// we've likely promoted both operands first. 
Any intermediate truncations or14281// extensions disappear.14282while (!PromOpHandles.empty()) {14283SDValue PromOp = PromOpHandles.back().getValue();14284PromOpHandles.pop_back();1428514286if (PromOp.getOpcode() == ISD::TRUNCATE ||14287PromOp.getOpcode() == ISD::SIGN_EXTEND ||14288PromOp.getOpcode() == ISD::ZERO_EXTEND ||14289PromOp.getOpcode() == ISD::ANY_EXTEND) {14290if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&14291PromOp.getOperand(0).getValueType() != MVT::i1) {14292// The operand is not yet ready (see comment below).14293PromOpHandles.emplace_front(PromOp);14294continue;14295}1429614297SDValue RepValue = PromOp.getOperand(0);14298if (isa<ConstantSDNode>(RepValue))14299RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);1430014301DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);14302continue;14303}1430414305unsigned C;14306switch (PromOp.getOpcode()) {14307default: C = 0; break;14308case ISD::SELECT: C = 1; break;14309case ISD::SELECT_CC: C = 2; break;14310}1431114312if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&14313PromOp.getOperand(C).getValueType() != MVT::i1) ||14314(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&14315PromOp.getOperand(C+1).getValueType() != MVT::i1)) {14316// The to-be-promoted operands of this node have not yet been14317// promoted (this should be rare because we're going through the14318// list backward, but if one of the operands has several users in14319// this cluster of to-be-promoted nodes, it is possible).14320PromOpHandles.emplace_front(PromOp);14321continue;14322}1432314324SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),14325PromOp.getNode()->op_end());1432614327// If there are any constant inputs, make sure they're replaced now.14328for (unsigned i = 0; i < 2; ++i)14329if (isa<ConstantSDNode>(Ops[C+i]))14330Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);1433114332DAG.ReplaceAllUsesOfValueWith(PromOp,14333DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));14334}1433514336// Now we're left with the initial truncation itself.14337if (N->getOpcode() == ISD::TRUNCATE)14338return N->getOperand(0);1433914340// Otherwise, this is a comparison. The operands to be compared have just14341// changed type (to i1), but everything else is the same.14342return SDValue(N, 0);14343}1434414345SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,14346DAGCombinerInfo &DCI) const {14347SelectionDAG &DAG = DCI.DAG;14348SDLoc dl(N);1434914350// If we're tracking CR bits, we need to be careful that we don't have:14351// zext(binary-ops(trunc(x), trunc(y)))14352// or14353// zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)14354// such that we're unnecessarily moving things into CR bits that can more14355// efficiently stay in GPRs. Note that if we're not certain that the high14356// bits are set as required by the final extension, we still may need to do14357// some masking to get the proper behavior.1435814359// This same functionality is important on PPC64 when dealing with14360// 32-to-64-bit extensions; these occur often when 32-bit values are used as14361// the return values of functions. 
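// The rewrite is sound because zero/sign extension distributes over the
// bitwise operators involved, up to the final masking mentioned above. A
// tiny self-checking illustration (not part of the lowering):
static inline bool extDistributesOverAnd(uint32_t X, uint32_t Y) {
  uint64_t Narrow = uint64_t(uint16_t(X) & uint16_t(Y)); // zext(trunc(x) & trunc(y))
  uint64_t Wide = (uint64_t(X) & uint64_t(Y)) & 0xFFFF;  // (zext(x) & zext(y)) masked
  return Narrow == Wide;                                 // holds for all X, Y
}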
Because it is so similar, it is handled14362// here as well.1436314364if (N->getValueType(0) != MVT::i32 &&14365N->getValueType(0) != MVT::i64)14366return SDValue();1436714368if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||14369(N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))14370return SDValue();1437114372if (N->getOperand(0).getOpcode() != ISD::AND &&14373N->getOperand(0).getOpcode() != ISD::OR &&14374N->getOperand(0).getOpcode() != ISD::XOR &&14375N->getOperand(0).getOpcode() != ISD::SELECT &&14376N->getOperand(0).getOpcode() != ISD::SELECT_CC)14377return SDValue();1437814379SmallVector<SDValue, 4> Inputs;14380SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;14381SmallPtrSet<SDNode *, 16> Visited;1438214383// Visit all inputs, collect all binary operations (and, or, xor and14384// select) that are all fed by truncations.14385while (!BinOps.empty()) {14386SDValue BinOp = BinOps.pop_back_val();1438714388if (!Visited.insert(BinOp.getNode()).second)14389continue;1439014391PromOps.push_back(BinOp);1439214393for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {14394// The condition of the select is not promoted.14395if (BinOp.getOpcode() == ISD::SELECT && i == 0)14396continue;14397if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)14398continue;1439914400if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||14401isa<ConstantSDNode>(BinOp.getOperand(i))) {14402Inputs.push_back(BinOp.getOperand(i));14403} else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||14404BinOp.getOperand(i).getOpcode() == ISD::OR ||14405BinOp.getOperand(i).getOpcode() == ISD::XOR ||14406BinOp.getOperand(i).getOpcode() == ISD::SELECT ||14407BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {14408BinOps.push_back(BinOp.getOperand(i));14409} else {14410// We have an input that is not a truncation or another binary14411// operation; we'll abort this transformation.14412return SDValue();14413}14414}14415}1441614417// The operands of a select that must be truncated when the select is14418// promoted because the operand is actually part of the to-be-promoted set.14419DenseMap<SDNode *, EVT> SelectTruncOp[2];1442014421// Make sure that this is a self-contained cluster of operations (which14422// is not quite the same thing as saying that everything has only one14423// use).14424for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {14425if (isa<ConstantSDNode>(Inputs[i]))14426continue;1442714428for (SDNode *User : Inputs[i].getNode()->uses()) {14429if (User != N && !Visited.count(User))14430return SDValue();1443114432// If we're going to promote the non-output-value operand(s) or SELECT or14433// SELECT_CC, record them for truncation.14434if (User->getOpcode() == ISD::SELECT) {14435if (User->getOperand(0) == Inputs[i])14436SelectTruncOp[0].insert(std::make_pair(User,14437User->getOperand(0).getValueType()));14438} else if (User->getOpcode() == ISD::SELECT_CC) {14439if (User->getOperand(0) == Inputs[i])14440SelectTruncOp[0].insert(std::make_pair(User,14441User->getOperand(0).getValueType()));14442if (User->getOperand(1) == Inputs[i])14443SelectTruncOp[1].insert(std::make_pair(User,14444User->getOperand(1).getValueType()));14445}14446}14447}1444814449for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {14450for (SDNode *User : PromOps[i].getNode()->uses()) {14451if (User != N && !Visited.count(User))14452return SDValue();1445314454// If we're going to promote the non-output-value operand(s) or SELECT or14455// SELECT_CC, record them for 
truncation.14456if (User->getOpcode() == ISD::SELECT) {14457if (User->getOperand(0) == PromOps[i])14458SelectTruncOp[0].insert(std::make_pair(User,14459User->getOperand(0).getValueType()));14460} else if (User->getOpcode() == ISD::SELECT_CC) {14461if (User->getOperand(0) == PromOps[i])14462SelectTruncOp[0].insert(std::make_pair(User,14463User->getOperand(0).getValueType()));14464if (User->getOperand(1) == PromOps[i])14465SelectTruncOp[1].insert(std::make_pair(User,14466User->getOperand(1).getValueType()));14467}14468}14469}1447014471unsigned PromBits = N->getOperand(0).getValueSizeInBits();14472bool ReallyNeedsExt = false;14473if (N->getOpcode() != ISD::ANY_EXTEND) {14474// If all of the inputs are not already sign/zero extended, then14475// we'll still need to do that at the end.14476for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {14477if (isa<ConstantSDNode>(Inputs[i]))14478continue;1447914480unsigned OpBits =14481Inputs[i].getOperand(0).getValueSizeInBits();14482assert(PromBits < OpBits && "Truncation not to a smaller bit count?");1448314484if ((N->getOpcode() == ISD::ZERO_EXTEND &&14485!DAG.MaskedValueIsZero(Inputs[i].getOperand(0),14486APInt::getHighBitsSet(OpBits,14487OpBits-PromBits))) ||14488(N->getOpcode() == ISD::SIGN_EXTEND &&14489DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <14490(OpBits-(PromBits-1)))) {14491ReallyNeedsExt = true;14492break;14493}14494}14495}1449614497// Replace all inputs, either with the truncation operand, or a14498// truncation or extension to the final output type.14499for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {14500// Constant inputs need to be replaced with the to-be-promoted nodes that14501// use them because they might have users outside of the cluster of14502// promoted nodes.14503if (isa<ConstantSDNode>(Inputs[i]))14504continue;1450514506SDValue InSrc = Inputs[i].getOperand(0);14507if (Inputs[i].getValueType() == N->getValueType(0))14508DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);14509else if (N->getOpcode() == ISD::SIGN_EXTEND)14510DAG.ReplaceAllUsesOfValueWith(Inputs[i],14511DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));14512else if (N->getOpcode() == ISD::ZERO_EXTEND)14513DAG.ReplaceAllUsesOfValueWith(Inputs[i],14514DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));14515else14516DAG.ReplaceAllUsesOfValueWith(Inputs[i],14517DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));14518}1451914520std::list<HandleSDNode> PromOpHandles;14521for (auto &PromOp : PromOps)14522PromOpHandles.emplace_back(PromOp);1452314524// Replace all operations (these are all the same, but have a different14525// (promoted) return type). 
DAG.getNode will validate that the types of14526// a binary operator match, so go through the list in reverse so that14527// we've likely promoted both operands first.14528while (!PromOpHandles.empty()) {14529SDValue PromOp = PromOpHandles.back().getValue();14530PromOpHandles.pop_back();1453114532unsigned C;14533switch (PromOp.getOpcode()) {14534default: C = 0; break;14535case ISD::SELECT: C = 1; break;14536case ISD::SELECT_CC: C = 2; break;14537}1453814539if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&14540PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||14541(!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&14542PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {14543// The to-be-promoted operands of this node have not yet been14544// promoted (this should be rare because we're going through the14545// list backward, but if one of the operands has several users in14546// this cluster of to-be-promoted nodes, it is possible).14547PromOpHandles.emplace_front(PromOp);14548continue;14549}1455014551// For SELECT and SELECT_CC nodes, we do a similar check for any14552// to-be-promoted comparison inputs.14553if (PromOp.getOpcode() == ISD::SELECT ||14554PromOp.getOpcode() == ISD::SELECT_CC) {14555if ((SelectTruncOp[0].count(PromOp.getNode()) &&14556PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||14557(SelectTruncOp[1].count(PromOp.getNode()) &&14558PromOp.getOperand(1).getValueType() != N->getValueType(0))) {14559PromOpHandles.emplace_front(PromOp);14560continue;14561}14562}1456314564SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),14565PromOp.getNode()->op_end());1456614567// If this node has constant inputs, then they'll need to be promoted here.14568for (unsigned i = 0; i < 2; ++i) {14569if (!isa<ConstantSDNode>(Ops[C+i]))14570continue;14571if (Ops[C+i].getValueType() == N->getValueType(0))14572continue;1457314574if (N->getOpcode() == ISD::SIGN_EXTEND)14575Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));14576else if (N->getOpcode() == ISD::ZERO_EXTEND)14577Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));14578else14579Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));14580}1458114582// If we've promoted the comparison inputs of a SELECT or SELECT_CC,14583// truncate them again to the original value type.14584if (PromOp.getOpcode() == ISD::SELECT ||14585PromOp.getOpcode() == ISD::SELECT_CC) {14586auto SI0 = SelectTruncOp[0].find(PromOp.getNode());14587if (SI0 != SelectTruncOp[0].end())14588Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);14589auto SI1 = SelectTruncOp[1].find(PromOp.getNode());14590if (SI1 != SelectTruncOp[1].end())14591Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);14592}1459314594DAG.ReplaceAllUsesOfValueWith(PromOp,14595DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));14596}1459714598// Now we're left with the initial extension itself.14599if (!ReallyNeedsExt)14600return N->getOperand(0);1460114602// To zero extend, just mask off everything except for the first bit (in the14603// i1 case).14604if (N->getOpcode() == ISD::ZERO_EXTEND)14605return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),14606DAG.getConstant(APInt::getLowBitsSet(14607N->getValueSizeInBits(0), PromBits),14608dl, N->getValueType(0)));1460914610assert(N->getOpcode() == ISD::SIGN_EXTEND &&14611"Invalid extension type");14612EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());14613SDValue ShiftCst 
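// Scalar picture (illustration only, assumes 0 < PromBits < 64) of the two
// fixups emitted when the inputs were not already extended as required:
// zero extension keeps only the low PromBits, and sign extension
// replicates bit PromBits-1 with a shift-left / arithmetic-shift-right pair.
static inline uint64_t zextLowBits(uint64_t V, unsigned PromBits) {
  return V & ((uint64_t(1) << PromBits) - 1);   // AND with a low-bit mask
}
static inline int64_t sextLowBits(uint64_t V, unsigned PromBits) {
  unsigned Shift = 64 - PromBits;
  return int64_t(V << Shift) >> Shift;          // SHL then SRA
}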
=14614DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);14615return DAG.getNode(14616ISD::SRA, dl, N->getValueType(0),14617DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),14618ShiftCst);14619}1462014621SDValue PPCTargetLowering::combineSetCC(SDNode *N,14622DAGCombinerInfo &DCI) const {14623assert(N->getOpcode() == ISD::SETCC &&14624"Should be called with a SETCC node");1462514626ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();14627if (CC == ISD::SETNE || CC == ISD::SETEQ) {14628SDValue LHS = N->getOperand(0);14629SDValue RHS = N->getOperand(1);1463014631// If there is a '0 - y' pattern, canonicalize the pattern to the RHS.14632if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&14633LHS.hasOneUse())14634std::swap(LHS, RHS);1463514636// x == 0-y --> x+y == 014637// x != 0-y --> x+y != 014638if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&14639RHS.hasOneUse()) {14640SDLoc DL(N);14641SelectionDAG &DAG = DCI.DAG;14642EVT VT = N->getValueType(0);14643EVT OpVT = LHS.getValueType();14644SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));14645return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);14646}14647}1464814649return DAGCombineTruncBoolExt(N, DCI);14650}1465114652// Is this an extending load from an f32 to an f64?14653static bool isFPExtLoad(SDValue Op) {14654if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))14655return LD->getExtensionType() == ISD::EXTLOAD &&14656Op.getValueType() == MVT::f64;14657return false;14658}1465914660/// Reduces the number of fp-to-int conversion when building a vector.14661///14662/// If this vector is built out of floating to integer conversions,14663/// transform it to a vector built out of floating point values followed by a14664/// single floating to integer conversion of the vector.14665/// Namely (build_vector (fptosi $A), (fptosi $B), ...)14666/// becomes (fptosi (build_vector ($A, $B, ...)))14667SDValue PPCTargetLowering::14668combineElementTruncationToVectorTruncation(SDNode *N,14669DAGCombinerInfo &DCI) const {14670assert(N->getOpcode() == ISD::BUILD_VECTOR &&14671"Should be called with a BUILD_VECTOR node");1467214673SelectionDAG &DAG = DCI.DAG;14674SDLoc dl(N);1467514676SDValue FirstInput = N->getOperand(0);14677assert(FirstInput.getOpcode() == PPCISD::MFVSR &&14678"The input operand must be an fp-to-int conversion.");1467914680// This combine happens after legalization so the fp_to_[su]i nodes are14681// already converted to PPCSISD nodes.14682unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();14683if (FirstConversion == PPCISD::FCTIDZ ||14684FirstConversion == PPCISD::FCTIDUZ ||14685FirstConversion == PPCISD::FCTIWZ ||14686FirstConversion == PPCISD::FCTIWUZ) {14687bool IsSplat = true;14688bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||14689FirstConversion == PPCISD::FCTIWUZ;14690EVT SrcVT = FirstInput.getOperand(0).getValueType();14691SmallVector<SDValue, 4> Ops;14692EVT TargetVT = N->getValueType(0);14693for (int i = 0, e = N->getNumOperands(); i < e; ++i) {14694SDValue NextOp = N->getOperand(i);14695if (NextOp.getOpcode() != PPCISD::MFVSR)14696return SDValue();14697unsigned NextConversion = NextOp.getOperand(0).getOpcode();14698if (NextConversion != FirstConversion)14699return SDValue();14700// If we are converting to 32-bit integers, we need to add an FP_ROUND.14701// This is not valid if the input was originally double precision. 
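// Concrete value (chosen for illustration) showing why a genuine double
// precision input must be rejected here: inserting the FP_ROUND to f32
// before the conversion can change the resulting integer (double rounding).
static inline bool fpRoundWouldChangeResult() {
  double D = 16777217.4;            // distinguishable in double, collapses upward in float
  return (int)D != (int)(float)D;   // true: 16777217 vs 16777218
}
// When the f64 value came from an extending f32 load, the FP_ROUND back to
// f32 is exact and both conversion orders agree, which is what the
// isFPExtLoad guard enforces.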
It is14702// also not profitable to do unless this is an extending load in which14703// case doing this combine will allow us to combine consecutive loads.14704if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))14705return SDValue();14706if (N->getOperand(i) != FirstInput)14707IsSplat = false;14708}1470914710// If this is a splat, we leave it as-is since there will be only a single14711// fp-to-int conversion followed by a splat of the integer. This is better14712// for 32-bit and smaller ints and neutral for 64-bit ints.14713if (IsSplat)14714return SDValue();1471514716// Now that we know we have the right type of node, get its operands14717for (int i = 0, e = N->getNumOperands(); i < e; ++i) {14718SDValue In = N->getOperand(i).getOperand(0);14719if (Is32Bit) {14720// For 32-bit values, we need to add an FP_ROUND node (if we made it14721// here, we know that all inputs are extending loads so this is safe).14722if (In.isUndef())14723Ops.push_back(DAG.getUNDEF(SrcVT));14724else {14725SDValue Trunc =14726DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, In.getOperand(0),14727DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));14728Ops.push_back(Trunc);14729}14730} else14731Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));14732}1473314734unsigned Opcode;14735if (FirstConversion == PPCISD::FCTIDZ ||14736FirstConversion == PPCISD::FCTIWZ)14737Opcode = ISD::FP_TO_SINT;14738else14739Opcode = ISD::FP_TO_UINT;1474014741EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;14742SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);14743return DAG.getNode(Opcode, dl, TargetVT, BV);14744}14745return SDValue();14746}1474714748/// Reduce the number of loads when building a vector.14749///14750/// Building a vector out of multiple loads can be converted to a load14751/// of the vector type if the loads are consecutive. If the loads are14752/// consecutive but in descending order, a shuffle is added at the end14753/// to reorder the vector.14754static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {14755assert(N->getOpcode() == ISD::BUILD_VECTOR &&14756"Should be called with a BUILD_VECTOR node");1475714758SDLoc dl(N);1475914760// Return early for non byte-sized type, as they can't be consecutive.14761if (!N->getValueType(0).getVectorElementType().isByteSized())14762return SDValue();1476314764bool InputsAreConsecutiveLoads = true;14765bool InputsAreReverseConsecutive = true;14766unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();14767SDValue FirstInput = N->getOperand(0);14768bool IsRoundOfExtLoad = false;14769LoadSDNode *FirstLoad = nullptr;1477014771if (FirstInput.getOpcode() == ISD::FP_ROUND &&14772FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {14773FirstLoad = cast<LoadSDNode>(FirstInput.getOperand(0));14774IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD;14775}14776// Not a build vector of (possibly fp_rounded) loads.14777if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||14778N->getNumOperands() == 1)14779return SDValue();1478014781if (!IsRoundOfExtLoad)14782FirstLoad = cast<LoadSDNode>(FirstInput);1478314784SmallVector<LoadSDNode *, 4> InputLoads;14785InputLoads.push_back(FirstLoad);14786for (int i = 1, e = N->getNumOperands(); i < e; ++i) {14787// If any inputs are fp_round(extload), they all must be.14788if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)14789return SDValue();1479014791SDValue NextInput = IsRoundOfExtLoad ? 
N->getOperand(i).getOperand(0) :14792N->getOperand(i);14793if (NextInput.getOpcode() != ISD::LOAD)14794return SDValue();1479514796SDValue PreviousInput =14797IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);14798LoadSDNode *LD1 = cast<LoadSDNode>(PreviousInput);14799LoadSDNode *LD2 = cast<LoadSDNode>(NextInput);1480014801// If any inputs are fp_round(extload), they all must be.14802if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)14803return SDValue();1480414805// We only care about regular loads. The PPC-specific load intrinsics14806// will not lead to a merge opportunity.14807if (!DAG.areNonVolatileConsecutiveLoads(LD2, LD1, ElemSize, 1))14808InputsAreConsecutiveLoads = false;14809if (!DAG.areNonVolatileConsecutiveLoads(LD1, LD2, ElemSize, 1))14810InputsAreReverseConsecutive = false;1481114812// Exit early if the loads are neither consecutive nor reverse consecutive.14813if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)14814return SDValue();14815InputLoads.push_back(LD2);14816}1481714818assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&14819"The loads cannot be both consecutive and reverse consecutive.");1482014821SDValue WideLoad;14822SDValue ReturnSDVal;14823if (InputsAreConsecutiveLoads) {14824assert(FirstLoad && "Input needs to be a LoadSDNode.");14825WideLoad = DAG.getLoad(N->getValueType(0), dl, FirstLoad->getChain(),14826FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),14827FirstLoad->getAlign());14828ReturnSDVal = WideLoad;14829} else if (InputsAreReverseConsecutive) {14830LoadSDNode *LastLoad = InputLoads.back();14831assert(LastLoad && "Input needs to be a LoadSDNode.");14832WideLoad = DAG.getLoad(N->getValueType(0), dl, LastLoad->getChain(),14833LastLoad->getBasePtr(), LastLoad->getPointerInfo(),14834LastLoad->getAlign());14835SmallVector<int, 16> Ops;14836for (int i = N->getNumOperands() - 1; i >= 0; i--)14837Ops.push_back(i);1483814839ReturnSDVal = DAG.getVectorShuffle(N->getValueType(0), dl, WideLoad,14840DAG.getUNDEF(N->getValueType(0)), Ops);14841} else14842return SDValue();1484314844for (auto *LD : InputLoads)14845DAG.makeEquivalentMemoryOrdering(LD, WideLoad);14846return ReturnSDVal;14847}1484814849// This function adds the required vector_shuffle needed to get14850// the elements of the vector extract in the correct position14851// as specified by the CorrectElems encoding.14852static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,14853SDValue Input, uint64_t Elems,14854uint64_t CorrectElems) {14855SDLoc dl(N);1485614857unsigned NumElems = Input.getValueType().getVectorNumElements();14858SmallVector<int, 16> ShuffleMask(NumElems, -1);1485914860// Knowing the element indices being extracted from the original14861// vector and the order in which they're being inserted, just put14862// them at element indices required for the instruction.14863for (unsigned i = 0; i < N->getNumOperands(); i++) {14864if (DAG.getDataLayout().isLittleEndian())14865ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;14866else14867ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;14868CorrectElems = CorrectElems >> 8;14869Elems = Elems >> 8;14870}1487114872SDValue Shuffle =14873DAG.getVectorShuffle(Input.getValueType(), dl, Input,14874DAG.getUNDEF(Input.getValueType()), ShuffleMask);1487514876EVT VT = N->getValueType(0);14877SDValue Conv = DAG.getBitcast(VT, Shuffle);1487814879EVT ExtVT = 
EVT::getVectorVT(*DAG.getContext(),14880Input.getValueType().getVectorElementType(),14881VT.getVectorNumElements());14882return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,14883DAG.getValueType(ExtVT));14884}1488514886// Look for build vector patterns where input operands come from sign14887// extended vector_extract elements of specific indices. If the correct indices14888// aren't used, add a vector shuffle to fix up the indices and create14889// SIGN_EXTEND_INREG node which selects the vector sign extend instructions14890// during instruction selection.14891static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {14892// This array encodes the indices that the vector sign extend instructions14893// extract from when extending from one type to another for both BE and LE.14894// The right nibble of each byte corresponds to the LE incides.14895// and the left nibble of each byte corresponds to the BE incides.14896// For example: 0x3074B8FC byte->word14897// For LE: the allowed indices are: 0x0,0x4,0x8,0xC14898// For BE: the allowed indices are: 0x3,0x7,0xB,0xF14899// For example: 0x000070F8 byte->double word14900// For LE: the allowed indices are: 0x0,0x814901// For BE: the allowed indices are: 0x7,0xF14902uint64_t TargetElems[] = {149030x3074B8FC, // b->w149040x000070F8, // b->d149050x10325476, // h->w149060x00003074, // h->d149070x00001032, // w->d14908};1490914910uint64_t Elems = 0;14911int Index;14912SDValue Input;1491314914auto isSExtOfVecExtract = [&](SDValue Op) -> bool {14915if (!Op)14916return false;14917if (Op.getOpcode() != ISD::SIGN_EXTEND &&14918Op.getOpcode() != ISD::SIGN_EXTEND_INREG)14919return false;1492014921// A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value14922// of the right width.14923SDValue Extract = Op.getOperand(0);14924if (Extract.getOpcode() == ISD::ANY_EXTEND)14925Extract = Extract.getOperand(0);14926if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)14927return false;1492814929ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));14930if (!ExtOp)14931return false;1493214933Index = ExtOp->getZExtValue();14934if (Input && Input != Extract.getOperand(0))14935return false;1493614937if (!Input)14938Input = Extract.getOperand(0);1493914940Elems = Elems << 8;14941Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;14942Elems |= Index;1494314944return true;14945};1494614947// If the build vector operands aren't sign extended vector extracts,14948// of the same input vector, then return.14949for (unsigned i = 0; i < N->getNumOperands(); i++) {14950if (!isSExtOfVecExtract(N->getOperand(i))) {14951return SDValue();14952}14953}1495414955// If the vector extract indices are not correct, add the appropriate14956// vector_shuffle.14957int TgtElemArrayIdx;14958int InputSize = Input.getValueType().getScalarSizeInBits();14959int OutputSize = N->getValueType(0).getScalarSizeInBits();14960if (InputSize + OutputSize == 40)14961TgtElemArrayIdx = 0;14962else if (InputSize + OutputSize == 72)14963TgtElemArrayIdx = 1;14964else if (InputSize + OutputSize == 48)14965TgtElemArrayIdx = 2;14966else if (InputSize + OutputSize == 80)14967TgtElemArrayIdx = 3;14968else if (InputSize + OutputSize == 96)14969TgtElemArrayIdx = 4;14970else14971return SDValue();1497214973uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];14974CorrectElems = DAG.getDataLayout().isLittleEndian()14975? 
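// How the TargetElems table above is read (assumed decoding helper, for
// illustration): each byte pairs the allowed big-endian extract index
// (high nibble) with the allowed little-endian index (low nibble). For
// 0x3074B8FC (byte->word) that gives LE indices 0,4,8,0xC and BE indices
// 3,7,0xB,0xF, matching the comment on the table.
static inline unsigned allowedExtractIndex(uint64_t Table, unsigned Byte,
                                           bool IsLittleEndian) {
  unsigned Entry = unsigned(Table >> (8 * Byte)) & 0xFF;
  return IsLittleEndian ? (Entry & 0xF) : (Entry >> 4);
}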
CorrectElems & 0x0F0F0F0F0F0F0F0F14976: CorrectElems & 0xF0F0F0F0F0F0F0F0;14977if (Elems != CorrectElems) {14978return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);14979}1498014981// Regular lowering will catch cases where a shuffle is not needed.14982return SDValue();14983}1498414985// Look for the pattern of a load from a narrow width to i128, feeding14986// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node14987// (LXVRZX). This node represents a zero extending load that will be matched14988// to the Load VSX Vector Rightmost instructions.14989static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {14990SDLoc DL(N);1499114992// This combine is only eligible for a BUILD_VECTOR of v1i128.14993if (N->getValueType(0) != MVT::v1i128)14994return SDValue();1499514996SDValue Operand = N->getOperand(0);14997// Proceed with the transformation if the operand to the BUILD_VECTOR14998// is a load instruction.14999if (Operand.getOpcode() != ISD::LOAD)15000return SDValue();1500115002auto *LD = cast<LoadSDNode>(Operand);15003EVT MemoryType = LD->getMemoryVT();1500415005// This transformation is only valid if the we are loading either a byte,15006// halfword, word, or doubleword.15007bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||15008MemoryType == MVT::i32 || MemoryType == MVT::i64;1500915010// Ensure that the load from the narrow width is being zero extended to i128.15011if (!ValidLDType ||15012(LD->getExtensionType() != ISD::ZEXTLOAD &&15013LD->getExtensionType() != ISD::EXTLOAD))15014return SDValue();1501515016SDValue LoadOps[] = {15017LD->getChain(), LD->getBasePtr(),15018DAG.getIntPtrConstant(MemoryType.getScalarSizeInBits(), DL)};1501915020return DAG.getMemIntrinsicNode(PPCISD::LXVRZX, DL,15021DAG.getVTList(MVT::v1i128, MVT::Other),15022LoadOps, MemoryType, LD->getMemOperand());15023}1502415025SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,15026DAGCombinerInfo &DCI) const {15027assert(N->getOpcode() == ISD::BUILD_VECTOR &&15028"Should be called with a BUILD_VECTOR node");1502915030SelectionDAG &DAG = DCI.DAG;15031SDLoc dl(N);1503215033if (!Subtarget.hasVSX())15034return SDValue();1503515036// The target independent DAG combiner will leave a build_vector of15037// float-to-int conversions intact. We can generate MUCH better code for15038// a float-to-int conversion of a vector of floats.15039SDValue FirstInput = N->getOperand(0);15040if (FirstInput.getOpcode() == PPCISD::MFVSR) {15041SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);15042if (Reduced)15043return Reduced;15044}1504515046// If we're building a vector out of consecutive loads, just load that15047// vector type.15048SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);15049if (Reduced)15050return Reduced;1505115052// If we're building a vector out of extended elements from another vector15053// we have P9 vector integer extend instructions. The code assumes legal15054// input types (i.e. 
it can't handle things like v4i16) so do not run before15055// legalization.15056if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {15057Reduced = combineBVOfVecSExt(N, DAG);15058if (Reduced)15059return Reduced;15060}1506115062// On Power10, the Load VSX Vector Rightmost instructions can be utilized15063// if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR15064// is a load from <valid narrow width> to i128.15065if (Subtarget.isISA3_1()) {15066SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);15067if (BVOfZLoad)15068return BVOfZLoad;15069}1507015071if (N->getValueType(0) != MVT::v2f64)15072return SDValue();1507315074// Looking for:15075// (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))15076if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&15077FirstInput.getOpcode() != ISD::UINT_TO_FP)15078return SDValue();15079if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&15080N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)15081return SDValue();15082if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())15083return SDValue();1508415085SDValue Ext1 = FirstInput.getOperand(0);15086SDValue Ext2 = N->getOperand(1).getOperand(0);15087if(Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||15088Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)15089return SDValue();1509015091ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));15092ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));15093if (!Ext1Op || !Ext2Op)15094return SDValue();15095if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||15096Ext1.getOperand(0) != Ext2.getOperand(0))15097return SDValue();1509815099int FirstElem = Ext1Op->getZExtValue();15100int SecondElem = Ext2Op->getZExtValue();15101int SubvecIdx;15102if (FirstElem == 0 && SecondElem == 1)15103SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;15104else if (FirstElem == 2 && SecondElem == 3)15105SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;15106else15107return SDValue();1510815109SDValue SrcVec = Ext1.getOperand(0);15110auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?15111PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;15112return DAG.getNode(NodeType, dl, MVT::v2f64,15113SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));15114}1511515116SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,15117DAGCombinerInfo &DCI) const {15118assert((N->getOpcode() == ISD::SINT_TO_FP ||15119N->getOpcode() == ISD::UINT_TO_FP) &&15120"Need an int -> FP conversion node here");1512115122if (useSoftFloat() || !Subtarget.has64BitSupport())15123return SDValue();1512415125SelectionDAG &DAG = DCI.DAG;15126SDLoc dl(N);15127SDValue Op(N, 0);1512815129// Don't handle ppc_fp128 here or conversions that are out-of-range capable15130// from the hardware.15131if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)15132return SDValue();15133if (!Op.getOperand(0).getValueType().isSimple())15134return SDValue();15135if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||15136Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))15137return SDValue();1513815139SDValue FirstOperand(Op.getOperand(0));15140bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&15141(FirstOperand.getValueType() == MVT::i8 ||15142FirstOperand.getValueType() == MVT::i16);15143if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {15144bool Signed = N->getOpcode() == ISD::SINT_TO_FP;15145bool DstDouble = Op.getValueType() == MVT::f64;15146unsigned ConvOp = Signed ?15147(DstDouble ? 
       PPCISD::FCFID : PPCISD::FCFIDS) :
      (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
    SDValue WidthConst =
        DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
                              dl, false);
    LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
                                         DAG.getVTList(MVT::f64, MVT::Other),
                                         Ops, MVT::i8, LDN->getMemOperand());
    DAG.makeEquivalentMemoryOrdering(LDN, Ld);

    // For signed conversion, we need to sign-extend the value in the VSR
    if (Signed) {
      SDValue ExtOps[] = { Ld, WidthConst };
      SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
    } else
      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
  }

  // For i32 intermediate values, unfortunately, the conversion functions
  // leave the upper 32 bits of the value undefined. Within the set of
  // scalar instructions, we have no method for zero- or sign-extending the
  // value. Thus, we cannot handle i32 intermediate values here.
  if (Op.getOperand(0).getValueType() == MVT::i32)
    return SDValue();

  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
                                                            : PPCISD::FCFIDS)
                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
                                                            : PPCISD::FCFID);
  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                  ? MVT::f32
                  : MVT::f64;

  // If we're converting from a float, to an int, and back to a float again,
  // then we don't need the store/load pair at all.
  if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
       Subtarget.hasFPCVT()) ||
      (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
    SDValue Src = Op.getOperand(0).getOperand(0);
    if (Src.getValueType() == MVT::f32) {
      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
      DCI.AddToWorklist(Src.getNode());
    } else if (Src.getValueType() != MVT::f64) {
      // Make sure that we don't pick up a ppc_fp128 source value.
      return SDValue();
    }

    unsigned FCTOp =
        Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ?
PPCISD::FCTIDZ :15206PPCISD::FCTIDUZ;1520715208SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);15209SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);1521015211if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {15212FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,15213DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));15214DCI.AddToWorklist(FP.getNode());15215}1521615217return FP;15218}1521915220return SDValue();15221}1522215223// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for15224// builtins) into loads with swaps.15225SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,15226DAGCombinerInfo &DCI) const {15227// Delay VSX load for LE combine until after LegalizeOps to prioritize other15228// load combines.15229if (DCI.isBeforeLegalizeOps())15230return SDValue();1523115232SelectionDAG &DAG = DCI.DAG;15233SDLoc dl(N);15234SDValue Chain;15235SDValue Base;15236MachineMemOperand *MMO;1523715238switch (N->getOpcode()) {15239default:15240llvm_unreachable("Unexpected opcode for little endian VSX load");15241case ISD::LOAD: {15242LoadSDNode *LD = cast<LoadSDNode>(N);15243Chain = LD->getChain();15244Base = LD->getBasePtr();15245MMO = LD->getMemOperand();15246// If the MMO suggests this isn't a load of a full vector, leave15247// things alone. For a built-in, we have to make the change for15248// correctness, so if there is a size problem that will be a bug.15249if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)15250return SDValue();15251break;15252}15253case ISD::INTRINSIC_W_CHAIN: {15254MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);15255Chain = Intrin->getChain();15256// Similarly to the store case below, Intrin->getBasePtr() doesn't get15257// us what we want. Get operand 2 instead.15258Base = Intrin->getOperand(2);15259MMO = Intrin->getMemOperand();15260break;15261}15262}1526315264MVT VecTy = N->getValueType(0).getSimpleVT();1526515266SDValue LoadOps[] = { Chain, Base };15267SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,15268DAG.getVTList(MVT::v2f64, MVT::Other),15269LoadOps, MVT::v2f64, MMO);1527015271DCI.AddToWorklist(Load.getNode());15272Chain = Load.getValue(1);15273SDValue Swap = DAG.getNode(15274PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);15275DCI.AddToWorklist(Swap.getNode());1527615277// Add a bitcast if the resulting load type doesn't match v2f64.15278if (VecTy != MVT::v2f64) {15279SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);15280DCI.AddToWorklist(N.getNode());15281// Package {bitcast value, swap's chain} to match Load's shape.15282return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),15283N, Swap.getValue(1));15284}1528515286return Swap;15287}1528815289// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for15290// builtins) into stores with swaps.15291SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,15292DAGCombinerInfo &DCI) const {15293// Delay VSX store for LE combine until after LegalizeOps to prioritize other15294// store combines.15295if (DCI.isBeforeLegalizeOps())15296return SDValue();1529715298SelectionDAG &DAG = DCI.DAG;15299SDLoc dl(N);15300SDValue Chain;15301SDValue Base;15302unsigned SrcOpnd;15303MachineMemOperand *MMO;1530415305switch (N->getOpcode()) {15306default:15307llvm_unreachable("Unexpected opcode for little endian VSX store");15308case ISD::STORE: {15309StoreSDNode *ST = cast<StoreSDNode>(N);15310Chain = ST->getChain();15311Base = ST->getBasePtr();15312MMO = ST->getMemOperand();15313SrcOpnd = 1;15314// If 
the MMO suggests this isn't a store of a full vector, leave15315// things alone. For a built-in, we have to make the change for15316// correctness, so if there is a size problem that will be a bug.15317if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)15318return SDValue();15319break;15320}15321case ISD::INTRINSIC_VOID: {15322MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);15323Chain = Intrin->getChain();15324// Intrin->getBasePtr() oddly does not get what we want.15325Base = Intrin->getOperand(3);15326MMO = Intrin->getMemOperand();15327SrcOpnd = 2;15328break;15329}15330}1533115332SDValue Src = N->getOperand(SrcOpnd);15333MVT VecTy = Src.getValueType().getSimpleVT();1533415335// All stores are done as v2f64 and possible bit cast.15336if (VecTy != MVT::v2f64) {15337Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);15338DCI.AddToWorklist(Src.getNode());15339}1534015341SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,15342DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);15343DCI.AddToWorklist(Swap.getNode());15344Chain = Swap.getValue(1);15345SDValue StoreOps[] = { Chain, Swap, Base };15346SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,15347DAG.getVTList(MVT::Other),15348StoreOps, VecTy, MMO);15349DCI.AddToWorklist(Store.getNode());15350return Store;15351}1535215353// Handle DAG combine for STORE (FP_TO_INT F).15354SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,15355DAGCombinerInfo &DCI) const {15356SelectionDAG &DAG = DCI.DAG;15357SDLoc dl(N);15358unsigned Opcode = N->getOperand(1).getOpcode();15359(void)Opcode;15360bool Strict = N->getOperand(1)->isStrictFPOpcode();1536115362assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||15363Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT)15364&& "Not a FP_TO_INT Instruction!");1536515366SDValue Val = N->getOperand(1).getOperand(Strict ? 1 : 0);15367EVT Op1VT = N->getOperand(1).getValueType();15368EVT ResVT = Val.getValueType();1536915370if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(ResVT))15371return SDValue();1537215373// Only perform combine for conversion to i64/i32 or power9 i16/i8.15374bool ValidTypeForStoreFltAsInt =15375(Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) ||15376(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));1537715378// TODO: Lower conversion from f128 on all VSX targets15379if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector()))15380return SDValue();1538115382if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) ||15383cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)15384return SDValue();1538515386Val = convertFPToInt(N->getOperand(1), DAG, Subtarget);1538715388// Set number of bytes being converted.15389unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;15390SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2),15391DAG.getIntPtrConstant(ByteSize, dl, false),15392DAG.getValueType(Op1VT)};1539315394Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,15395DAG.getVTList(MVT::Other), Ops,15396cast<StoreSDNode>(N)->getMemoryVT(),15397cast<StoreSDNode>(N)->getMemOperand());1539815399return Val;15400}1540115402static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {15403// Check that the source of the element keeps flipping15404// (i.e. 
Mask[i] < NumElts -> Mask[i+i] >= NumElts).15405bool PrevElemFromFirstVec = Mask[0] < NumElts;15406for (int i = 1, e = Mask.size(); i < e; i++) {15407if (PrevElemFromFirstVec && Mask[i] < NumElts)15408return false;15409if (!PrevElemFromFirstVec && Mask[i] >= NumElts)15410return false;15411PrevElemFromFirstVec = !PrevElemFromFirstVec;15412}15413return true;15414}1541515416static bool isSplatBV(SDValue Op) {15417if (Op.getOpcode() != ISD::BUILD_VECTOR)15418return false;15419SDValue FirstOp;1542015421// Find first non-undef input.15422for (int i = 0, e = Op.getNumOperands(); i < e; i++) {15423FirstOp = Op.getOperand(i);15424if (!FirstOp.isUndef())15425break;15426}1542715428// All inputs are undef or the same as the first non-undef input.15429for (int i = 1, e = Op.getNumOperands(); i < e; i++)15430if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())15431return false;15432return true;15433}1543415435static SDValue isScalarToVec(SDValue Op) {15436if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)15437return Op;15438if (Op.getOpcode() != ISD::BITCAST)15439return SDValue();15440Op = Op.getOperand(0);15441if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)15442return Op;15443return SDValue();15444}1544515446// Fix up the shuffle mask to account for the fact that the result of15447// scalar_to_vector is not in lane zero. This just takes all values in15448// the ranges specified by the min/max indices and adds the number of15449// elements required to ensure each element comes from the respective15450// position in the valid lane.15451// On little endian, that's just the corresponding element in the other15452// half of the vector. On big endian, it is in the same half but right15453// justified rather than left justified in that half.15454static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,15455int LHSMaxIdx, int RHSMinIdx,15456int RHSMaxIdx, int HalfVec,15457unsigned ValidLaneWidth,15458const PPCSubtarget &Subtarget) {15459for (int i = 0, e = ShuffV.size(); i < e; i++) {15460int Idx = ShuffV[i];15461if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))15462ShuffV[i] +=15463Subtarget.isLittleEndian() ? HalfVec : HalfVec - ValidLaneWidth;15464}15465}1546615467// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if15468// the original is:15469// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))15470// In such a case, just change the shuffle mask to extract the element15471// from the permuted index.15472static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,15473const PPCSubtarget &Subtarget) {15474SDLoc dl(OrigSToV);15475EVT VT = OrigSToV.getValueType();15476assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&15477"Expecting a SCALAR_TO_VECTOR here");15478SDValue Input = OrigSToV.getOperand(0);1547915480if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {15481ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));15482SDValue OrigVector = Input.getOperand(0);1548315484// Can't handle non-const element indices or different vector types15485// for the input to the extract and the output of the scalar_to_vector.15486if (Idx && VT == OrigVector.getValueType()) {15487unsigned NumElts = VT.getVectorNumElements();15488assert(15489NumElts > 1 &&15490"Cannot produce a permuted scalar_to_vector for one element vector");15491SmallVector<int, 16> NewMask(NumElts, -1);15492unsigned ResultInElt = NumElts / 2;15493ResultInElt -= Subtarget.isLittleEndian() ? 
0 : 1;15494NewMask[ResultInElt] = Idx->getZExtValue();15495return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);15496}15497}15498return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT,15499OrigSToV.getOperand(0));15500}1550115502// On little endian subtargets, combine shuffles such as:15503// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b15504// into:15505// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b15506// because the latter can be matched to a single instruction merge.15507// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute15508// to put the value into element zero. Adjust the shuffle mask so that the15509// vector can remain in permuted form (to prevent a swap prior to a shuffle).15510// On big endian targets, this is still useful for SCALAR_TO_VECTOR15511// nodes with elements smaller than doubleword because all the ways15512// of getting scalar data into a vector register put the value in the15513// rightmost element of the left half of the vector.15514SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,15515SelectionDAG &DAG) const {15516SDValue LHS = SVN->getOperand(0);15517SDValue RHS = SVN->getOperand(1);15518auto Mask = SVN->getMask();15519int NumElts = LHS.getValueType().getVectorNumElements();15520SDValue Res(SVN, 0);15521SDLoc dl(SVN);15522bool IsLittleEndian = Subtarget.isLittleEndian();1552315524// On big endian targets this is only useful for subtargets with direct moves.15525// On little endian targets it would be useful for all subtargets with VSX.15526// However adding special handling for LE subtargets without direct moves15527// would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)15528// which includes direct moves.15529if (!Subtarget.hasDirectMove())15530return Res;1553115532// If this is not a shuffle of a shuffle and the first element comes from15533// the second vector, canonicalize to the commuted form. This will make it15534// more likely to match one of the single instruction patterns.15535if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&15536RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {15537std::swap(LHS, RHS);15538Res = DAG.getCommutedVectorShuffle(*SVN);15539Mask = cast<ShuffleVectorSDNode>(Res)->getMask();15540}1554115542// Adjust the shuffle mask if either input vector comes from a15543// SCALAR_TO_VECTOR and keep the respective input vector in permuted15544// form (to prevent the need for a swap).15545SmallVector<int, 16> ShuffV(Mask);15546SDValue SToVLHS = isScalarToVec(LHS);15547SDValue SToVRHS = isScalarToVec(RHS);15548if (SToVLHS || SToVRHS) {15549// FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the15550// same type and have differing element sizes, then do not perform15551// the following transformation. The current transformation for15552// SCALAR_TO_VECTOR assumes that both input vectors have the same15553// element size. This will be updated in the future to account for15554// differing sizes of the LHS and RHS.15555if (SToVLHS && SToVRHS &&15556(SToVLHS.getValueType().getScalarSizeInBits() !=15557SToVRHS.getValueType().getScalarSizeInBits()))15558return Res;1555915560int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()15561: SToVRHS.getValueType().getVectorNumElements();15562int NumEltsOut = ShuffV.size();15563// The width of the "valid lane" (i.e. 
the lane that contains the value that15564// is vectorized) needs to be expressed in terms of the number of elements15565// of the shuffle. It is thereby the ratio of the values before and after15566// any bitcast.15567unsigned ValidLaneWidth =15568SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() /15569LHS.getValueType().getScalarSizeInBits()15570: SToVRHS.getValueType().getScalarSizeInBits() /15571RHS.getValueType().getScalarSizeInBits();1557215573// Initially assume that neither input is permuted. These will be adjusted15574// accordingly if either input is.15575int LHSMaxIdx = -1;15576int RHSMinIdx = -1;15577int RHSMaxIdx = -1;15578int HalfVec = LHS.getValueType().getVectorNumElements() / 2;1557915580// Get the permuted scalar to vector nodes for the source(s) that come from15581// ISD::SCALAR_TO_VECTOR.15582// On big endian systems, this only makes sense for element sizes smaller15583// than 64 bits since for 64-bit elements, all instructions already put15584// the value into element zero. Since scalar size of LHS and RHS may differ15585// after isScalarToVec, this should be checked using their own sizes.15586if (SToVLHS) {15587if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64)15588return Res;15589// Set up the values for the shuffle vector fixup.15590LHSMaxIdx = NumEltsOut / NumEltsIn;15591SToVLHS = getSToVPermuted(SToVLHS, DAG, Subtarget);15592if (SToVLHS.getValueType() != LHS.getValueType())15593SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);15594LHS = SToVLHS;15595}15596if (SToVRHS) {15597if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64)15598return Res;15599RHSMinIdx = NumEltsOut;15600RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;15601SToVRHS = getSToVPermuted(SToVRHS, DAG, Subtarget);15602if (SToVRHS.getValueType() != RHS.getValueType())15603SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);15604RHS = SToVRHS;15605}1560615607// Fix up the shuffle mask to reflect where the desired element actually is.15608// The minimum and maximum indices that correspond to element zero for both15609// the LHS and RHS are computed and will control which shuffle mask entries15610// are to be changed. For example, if the RHS is permuted, any shuffle mask15611// entries in the range [RHSMinIdx,RHSMaxIdx) will be adjusted.15612fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,15613HalfVec, ValidLaneWidth, Subtarget);15614Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);1561515616// We may have simplified away the shuffle. We won't be able to do anything15617// further with it here.15618if (!isa<ShuffleVectorSDNode>(Res))15619return Res;15620Mask = cast<ShuffleVectorSDNode>(Res)->getMask();15621}1562215623SDValue TheSplat = IsLittleEndian ? 
RHS : LHS;15624// The common case after we commuted the shuffle is that the RHS is a splat15625// and we have elements coming in from the splat at indices that are not15626// conducive to using a merge.15627// Example:15628// vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>15629if (!isSplatBV(TheSplat))15630return Res;1563115632// We are looking for a mask such that all even elements are from15633// one vector and all odd elements from the other.15634if (!isAlternatingShuffMask(Mask, NumElts))15635return Res;1563615637// Adjust the mask so we are pulling in the same index from the splat15638// as the index from the interesting vector in consecutive elements.15639if (IsLittleEndian) {15640// Example (even elements from first vector):15641// vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>15642if (Mask[0] < NumElts)15643for (int i = 1, e = Mask.size(); i < e; i += 2) {15644if (ShuffV[i] < 0)15645continue;15646// If element from non-splat is undef, pick first element from splat.15647ShuffV[i] = (ShuffV[i - 1] >= 0 ? ShuffV[i - 1] : 0) + NumElts;15648}15649// Example (odd elements from first vector):15650// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>15651else15652for (int i = 0, e = Mask.size(); i < e; i += 2) {15653if (ShuffV[i] < 0)15654continue;15655// If element from non-splat is undef, pick first element from splat.15656ShuffV[i] = (ShuffV[i + 1] >= 0 ? ShuffV[i + 1] : 0) + NumElts;15657}15658} else {15659// Example (even elements from first vector):15660// vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t115661if (Mask[0] < NumElts)15662for (int i = 0, e = Mask.size(); i < e; i += 2) {15663if (ShuffV[i] < 0)15664continue;15665// If element from non-splat is undef, pick first element from splat.15666ShuffV[i] = ShuffV[i + 1] >= 0 ? ShuffV[i + 1] - NumElts : 0;15667}15668// Example (odd elements from first vector):15669// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t115670else15671for (int i = 1, e = Mask.size(); i < e; i += 2) {15672if (ShuffV[i] < 0)15673continue;15674// If element from non-splat is undef, pick first element from splat.15675ShuffV[i] = ShuffV[i - 1] >= 0 ? 
ShuffV[i - 1] - NumElts : 0;15676}15677}1567815679// If the RHS has undefs, we need to remove them since we may have created15680// a shuffle that adds those instead of the splat value.15681SDValue SplatVal =15682cast<BuildVectorSDNode>(TheSplat.getNode())->getSplatValue();15683TheSplat = DAG.getSplatBuildVector(TheSplat.getValueType(), dl, SplatVal);1568415685if (IsLittleEndian)15686RHS = TheSplat;15687else15688LHS = TheSplat;15689return DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);15690}1569115692SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,15693LSBaseSDNode *LSBase,15694DAGCombinerInfo &DCI) const {15695assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&15696"Not a reverse memop pattern!");1569715698auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {15699auto Mask = SVN->getMask();15700int i = 0;15701auto I = Mask.rbegin();15702auto E = Mask.rend();1570315704for (; I != E; ++I) {15705if (*I != i)15706return false;15707i++;15708}15709return true;15710};1571115712SelectionDAG &DAG = DCI.DAG;15713EVT VT = SVN->getValueType(0);1571415715if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())15716return SDValue();1571715718// Before P9, we have PPCVSXSwapRemoval pass to hack the element order.15719// See comment in PPCVSXSwapRemoval.cpp.15720// It is conflict with PPCVSXSwapRemoval opt. So we don't do it.15721if (!Subtarget.hasP9Vector())15722return SDValue();1572315724if(!IsElementReverse(SVN))15725return SDValue();1572615727if (LSBase->getOpcode() == ISD::LOAD) {15728// If the load return value 0 has more than one user except the15729// shufflevector instruction, it is not profitable to replace the15730// shufflevector with a reverse load.15731for (SDNode::use_iterator UI = LSBase->use_begin(), UE = LSBase->use_end();15732UI != UE; ++UI)15733if (UI.getUse().getResNo() == 0 && UI->getOpcode() != ISD::VECTOR_SHUFFLE)15734return SDValue();1573515736SDLoc dl(LSBase);15737SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};15738return DAG.getMemIntrinsicNode(15739PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,15740LSBase->getMemoryVT(), LSBase->getMemOperand());15741}1574215743if (LSBase->getOpcode() == ISD::STORE) {15744// If there are other uses of the shuffle, the swap cannot be avoided.15745// Forcing the use of an X-Form (since swapped stores only have15746// X-Forms) without removing the swap is unprofitable.15747if (!SVN->hasOneUse())15748return SDValue();1574915750SDLoc dl(LSBase);15751SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),15752LSBase->getBasePtr()};15753return DAG.getMemIntrinsicNode(15754PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,15755LSBase->getMemoryVT(), LSBase->getMemOperand());15756}1575715758llvm_unreachable("Expected a load or store node here");15759}1576015761static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {15762unsigned IntrinsicID = Intrin.getConstantOperandVal(1);15763if (IntrinsicID == Intrinsic::ppc_stdcx)15764StoreWidth = 8;15765else if (IntrinsicID == Intrinsic::ppc_stwcx)15766StoreWidth = 4;15767else if (IntrinsicID == Intrinsic::ppc_sthcx)15768StoreWidth = 2;15769else if (IntrinsicID == Intrinsic::ppc_stbcx)15770StoreWidth = 1;15771else15772return false;15773return true;15774}1577515776SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,15777DAGCombinerInfo &DCI) const {15778SelectionDAG &DAG = DCI.DAG;15779SDLoc dl(N);15780switch (N->getOpcode()) {15781default: 
break;15782case ISD::ADD:15783return combineADD(N, DCI);15784case ISD::AND: {15785// We don't want (and (zext (shift...)), C) if C fits in the width of the15786// original input as that will prevent us from selecting optimal rotates.15787// This only matters if the input to the extend is i32 widened to i64.15788SDValue Op1 = N->getOperand(0);15789SDValue Op2 = N->getOperand(1);15790if ((Op1.getOpcode() != ISD::ZERO_EXTEND &&15791Op1.getOpcode() != ISD::ANY_EXTEND) ||15792!isa<ConstantSDNode>(Op2) || N->getValueType(0) != MVT::i64 ||15793Op1.getOperand(0).getValueType() != MVT::i32)15794break;15795SDValue NarrowOp = Op1.getOperand(0);15796if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL &&15797NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)15798break;1579915800uint64_t Imm = Op2->getAsZExtVal();15801// Make sure that the constant is narrow enough to fit in the narrow type.15802if (!isUInt<32>(Imm))15803break;15804SDValue ConstOp = DAG.getConstant(Imm, dl, MVT::i32);15805SDValue NarrowAnd = DAG.getNode(ISD::AND, dl, MVT::i32, NarrowOp, ConstOp);15806return DAG.getZExtOrTrunc(NarrowAnd, dl, N->getValueType(0));15807}15808case ISD::SHL:15809return combineSHL(N, DCI);15810case ISD::SRA:15811return combineSRA(N, DCI);15812case ISD::SRL:15813return combineSRL(N, DCI);15814case ISD::MUL:15815return combineMUL(N, DCI);15816case ISD::FMA:15817case PPCISD::FNMSUB:15818return combineFMALike(N, DCI);15819case PPCISD::SHL:15820if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.15821return N->getOperand(0);15822break;15823case PPCISD::SRL:15824if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.15825return N->getOperand(0);15826break;15827case PPCISD::SRA:15828if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {15829if (C->isZero() || // 0 >>s V -> 0.15830C->isAllOnes()) // -1 >>s V -> -1.15831return N->getOperand(0);15832}15833break;15834case ISD::SIGN_EXTEND:15835case ISD::ZERO_EXTEND:15836case ISD::ANY_EXTEND:15837return DAGCombineExtBoolTrunc(N, DCI);15838case ISD::TRUNCATE:15839return combineTRUNCATE(N, DCI);15840case ISD::SETCC:15841if (SDValue CSCC = combineSetCC(N, DCI))15842return CSCC;15843[[fallthrough]];15844case ISD::SELECT_CC:15845return DAGCombineTruncBoolExt(N, DCI);15846case ISD::SINT_TO_FP:15847case ISD::UINT_TO_FP:15848return combineFPToIntToFP(N, DCI);15849case ISD::VECTOR_SHUFFLE:15850if (ISD::isNormalLoad(N->getOperand(0).getNode())) {15851LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));15852return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);15853}15854return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);15855case ISD::STORE: {1585615857EVT Op1VT = N->getOperand(1).getValueType();15858unsigned Opcode = N->getOperand(1).getOpcode();1585915860if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||15861Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) {15862SDValue Val = combineStoreFPToInt(N, DCI);15863if (Val)15864return Val;15865}1586615867if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {15868ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));15869SDValue Val= combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);15870if (Val)15871return Val;15872}1587315874// Turn STORE (BSWAP) -> sthbrx/stwbrx.15875if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&15876N->getOperand(1).getNode()->hasOneUse() &&15877(Op1VT == MVT::i32 || Op1VT == MVT::i16 ||15878(Subtarget.hasLDBRX() && Subtarget.isPPC64() && 
Op1VT == MVT::i64))) {1587915880// STBRX can only handle simple types and it makes no sense to store less15881// two bytes in byte-reversed order.15882EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();15883if (mVT.isExtended() || mVT.getSizeInBits() < 16)15884break;1588515886SDValue BSwapOp = N->getOperand(1).getOperand(0);15887// Do an any-extend to 32-bits if this is a half-word input.15888if (BSwapOp.getValueType() == MVT::i16)15889BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);1589015891// If the type of BSWAP operand is wider than stored memory width15892// it need to be shifted to the right side before STBRX.15893if (Op1VT.bitsGT(mVT)) {15894int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();15895BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,15896DAG.getConstant(Shift, dl, MVT::i32));15897// Need to truncate if this is a bswap of i64 stored as i32/i16.15898if (Op1VT == MVT::i64)15899BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);15900}1590115902SDValue Ops[] = {15903N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)15904};15905return15906DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),15907Ops, cast<StoreSDNode>(N)->getMemoryVT(),15908cast<StoreSDNode>(N)->getMemOperand());15909}1591015911// STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>15912// So it can increase the chance of CSE constant construction.15913if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&15914isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {15915// Need to sign-extended to 64-bits to handle negative values.15916EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();15917uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),15918MemVT.getSizeInBits());15919SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);1592015921// DAG.getTruncStore() can't be used here because it doesn't accept15922// the general (base + offset) addressing mode.15923// So we use UpdateNodeOperands and setTruncatingStore instead.15924DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),15925N->getOperand(3));15926cast<StoreSDNode>(N)->setTruncatingStore(true);15927return SDValue(N, 0);15928}1592915930// For little endian, VSX stores require generating xxswapd/lxvd2x.15931// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.15932if (Op1VT.isSimple()) {15933MVT StoreVT = Op1VT.getSimpleVT();15934if (Subtarget.needsSwapsForVSXMemOps() &&15935(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||15936StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))15937return expandVSXStoreForLE(N, DCI);15938}15939break;15940}15941case ISD::LOAD: {15942LoadSDNode *LD = cast<LoadSDNode>(N);15943EVT VT = LD->getValueType(0);1594415945// For little endian, VSX loads require generating lxvd2x/xxswapd.15946// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.15947if (VT.isSimple()) {15948MVT LoadVT = VT.getSimpleVT();15949if (Subtarget.needsSwapsForVSXMemOps() &&15950(LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||15951LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))15952return expandVSXLoadForLE(N, DCI);15953}1595415955// We sometimes end up with a 64-bit integer load, from which we extract15956// two single-precision floating-point numbers. This happens with15957// std::complex<float>, and other similar structures, because of the way we15958// canonicalize structure copies. 
However, if we lack direct moves,15959// then the final bitcasts from the extracted integer values to the15960// floating-point numbers turn into store/load pairs. Even with direct moves,15961// just loading the two floating-point numbers is likely better.15962auto ReplaceTwoFloatLoad = [&]() {15963if (VT != MVT::i64)15964return false;1596515966if (LD->getExtensionType() != ISD::NON_EXTLOAD ||15967LD->isVolatile())15968return false;1596915970// We're looking for a sequence like this:15971// t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i6415972// t16: i64 = srl t13, Constant:i32<32>15973// t17: i32 = truncate t1615974// t18: f32 = bitcast t1715975// t19: i32 = truncate t1315976// t20: f32 = bitcast t191597715978if (!LD->hasNUsesOfValue(2, 0))15979return false;1598015981auto UI = LD->use_begin();15982while (UI.getUse().getResNo() != 0) ++UI;15983SDNode *Trunc = *UI++;15984while (UI.getUse().getResNo() != 0) ++UI;15985SDNode *RightShift = *UI;15986if (Trunc->getOpcode() != ISD::TRUNCATE)15987std::swap(Trunc, RightShift);1598815989if (Trunc->getOpcode() != ISD::TRUNCATE ||15990Trunc->getValueType(0) != MVT::i32 ||15991!Trunc->hasOneUse())15992return false;15993if (RightShift->getOpcode() != ISD::SRL ||15994!isa<ConstantSDNode>(RightShift->getOperand(1)) ||15995RightShift->getConstantOperandVal(1) != 32 ||15996!RightShift->hasOneUse())15997return false;1599815999SDNode *Trunc2 = *RightShift->use_begin();16000if (Trunc2->getOpcode() != ISD::TRUNCATE ||16001Trunc2->getValueType(0) != MVT::i32 ||16002!Trunc2->hasOneUse())16003return false;1600416005SDNode *Bitcast = *Trunc->use_begin();16006SDNode *Bitcast2 = *Trunc2->use_begin();1600716008if (Bitcast->getOpcode() != ISD::BITCAST ||16009Bitcast->getValueType(0) != MVT::f32)16010return false;16011if (Bitcast2->getOpcode() != ISD::BITCAST ||16012Bitcast2->getValueType(0) != MVT::f32)16013return false;1601416015if (Subtarget.isLittleEndian())16016std::swap(Bitcast, Bitcast2);1601716018// Bitcast has the second float (in memory-layout order) and Bitcast216019// has the first one.1602016021SDValue BasePtr = LD->getBasePtr();16022if (LD->isIndexed()) {16023assert(LD->getAddressingMode() == ISD::PRE_INC &&16024"Non-pre-inc AM on PPC?");16025BasePtr =16026DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,16027LD->getOffset());16028}1602916030auto MMOFlags =16031LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;16032SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,16033LD->getPointerInfo(), LD->getAlign(),16034MMOFlags, LD->getAAInfo());16035SDValue AddPtr =16036DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),16037BasePtr, DAG.getIntPtrConstant(4, dl));16038SDValue FloatLoad2 = DAG.getLoad(16039MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,16040LD->getPointerInfo().getWithOffset(4),16041commonAlignment(LD->getAlign(), 4), MMOFlags, LD->getAAInfo());1604216043if (LD->isIndexed()) {16044// Note that DAGCombine should re-form any pre-increment load(s) from16045// what is produced here if that makes sense.16046DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);16047}1604816049DCI.CombineTo(Bitcast2, FloatLoad);16050DCI.CombineTo(Bitcast, FloatLoad2);1605116052DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 
2 : 1),16053SDValue(FloatLoad2.getNode(), 1));16054return true;16055};1605616057if (ReplaceTwoFloatLoad())16058return SDValue(N, 0);1605916060EVT MemVT = LD->getMemoryVT();16061Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());16062Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);16063if (LD->isUnindexed() && VT.isVector() &&16064((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&16065// P8 and later hardware should just use LOAD.16066!Subtarget.hasP8Vector() &&16067(VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||16068VT == MVT::v4f32))) &&16069LD->getAlign() < ABIAlignment) {16070// This is a type-legal unaligned Altivec load.16071SDValue Chain = LD->getChain();16072SDValue Ptr = LD->getBasePtr();16073bool isLittleEndian = Subtarget.isLittleEndian();1607416075// This implements the loading of unaligned vectors as described in16076// the venerable Apple Velocity Engine overview. Specifically:16077// https://developer.apple.com/hardwaredrivers/ve/alignment.html16078// https://developer.apple.com/hardwaredrivers/ve/code_optimization.html16079//16080// The general idea is to expand a sequence of one or more unaligned16081// loads into an alignment-based permutation-control instruction (lvsl16082// or lvsr), a series of regular vector loads (which always truncate16083// their input address to an aligned address), and a series of16084// permutations. The results of these permutations are the requested16085// loaded values. The trick is that the last "extra" load is not taken16086// from the address you might suspect (sizeof(vector) bytes after the16087// last requested load), but rather sizeof(vector) - 1 bytes after the16088// last requested vector. The point of this is to avoid a page fault if16089// the base address happened to be aligned. This works because if the16090// base address is aligned, then adding less than a full vector length16091// will cause the last vector in the sequence to be (re)loaded.16092// Otherwise, the next vector will be fetched as you might suspect was16093// necessary.1609416095// We might be able to reuse the permutation generation from16096// a different base address offset from this one by an aligned amount.16097// The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this16098// optimization later.16099Intrinsic::ID Intr, IntrLD, IntrPerm;16100MVT PermCntlTy, PermTy, LDTy;16101Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr16102: Intrinsic::ppc_altivec_lvsl;16103IntrLD = Intrinsic::ppc_altivec_lvx;16104IntrPerm = Intrinsic::ppc_altivec_vperm;16105PermCntlTy = MVT::v16i8;16106PermTy = MVT::v4i32;16107LDTy = MVT::v4i32;1610816109SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);1611016111// Create the new MMO for the new base load. It is like the original MMO,16112// but represents an area in memory almost twice the vector size centered16113// on the original address. 
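      // For illustration (worked example of the range described above, derived
      // from the offset/size arguments used just below): for a 16-byte v4i32
      // load at address A, the new MMO is built with offset -(16 - 1) = -15
      // and size 2 * 16 - 1 = 31, i.e. it describes the bytes [A - 15, A + 16),
      // which covers every byte either of the two aligned lvx loads may touch.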
If the address is unaligned, we might start16114// reading up to (sizeof(vector)-1) bytes below the address of the16115// original unaligned load.16116MachineFunction &MF = DAG.getMachineFunction();16117MachineMemOperand *BaseMMO =16118MF.getMachineMemOperand(LD->getMemOperand(),16119-(int64_t)MemVT.getStoreSize()+1,161202*MemVT.getStoreSize()-1);1612116122// Create the new base load.16123SDValue LDXIntID =16124DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));16125SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };16126SDValue BaseLoad =16127DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,16128DAG.getVTList(PermTy, MVT::Other),16129BaseLoadOps, LDTy, BaseMMO);1613016131// Note that the value of IncOffset (which is provided to the next16132// load's pointer info offset value, and thus used to calculate the16133// alignment), and the value of IncValue (which is actually used to16134// increment the pointer value) are different! This is because we16135// require the next load to appear to be aligned, even though it16136// is actually offset from the base pointer by a lesser amount.16137int IncOffset = VT.getSizeInBits() / 8;16138int IncValue = IncOffset;1613916140// Walk (both up and down) the chain looking for another load at the real16141// (aligned) offset (the alignment of the other load does not matter in16142// this case). If found, then do not use the offset reduction trick, as16143// that will prevent the loads from being later combined (as they would16144// otherwise be duplicates).16145if (!findConsecutiveLoad(LD, DAG))16146--IncValue;1614716148SDValue Increment =16149DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));16150Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);1615116152MachineMemOperand *ExtraMMO =16153MF.getMachineMemOperand(LD->getMemOperand(),161541, 2*MemVT.getStoreSize()-1);16155SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };16156SDValue ExtraLoad =16157DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,16158DAG.getVTList(PermTy, MVT::Other),16159ExtraLoadOps, LDTy, ExtraMMO);1616016161SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,16162BaseLoad.getValue(1), ExtraLoad.getValue(1));1616316164// Because vperm has a big-endian bias, we must reverse the order16165// of the input vectors and complement the permute control vector16166// when generating little endian code. We have already handled the16167// latter by using lvsr instead of lvsl, so just reverse BaseLoad16168// and ExtraLoad here.16169SDValue Perm;16170if (isLittleEndian)16171Perm = BuildIntrinsicOp(IntrPerm,16172ExtraLoad, BaseLoad, PermCntl, DAG, dl);16173else16174Perm = BuildIntrinsicOp(IntrPerm,16175BaseLoad, ExtraLoad, PermCntl, DAG, dl);1617616177if (VT != PermTy)16178Perm = Subtarget.hasAltivec()16179? DAG.getNode(ISD::BITCAST, dl, VT, Perm)16180: DAG.getNode(ISD::FP_ROUND, dl, VT, Perm,16181DAG.getTargetConstant(1, dl, MVT::i64));16182// second argument is 1 because this rounding16183// is always exact.1618416185// The output of the permutation is our loaded result, the TokenFactor is16186// our new chain.16187DCI.CombineTo(N, Perm, TF);16188return SDValue(N, 0);16189}16190}16191break;16192case ISD::INTRINSIC_WO_CHAIN: {16193bool isLittleEndian = Subtarget.isLittleEndian();16194unsigned IID = N->getConstantOperandVal(0);16195Intrinsic::ID Intr = (isLittleEndian ? 
Intrinsic::ppc_altivec_lvsr16196: Intrinsic::ppc_altivec_lvsl);16197if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {16198SDValue Add = N->getOperand(1);1619916200int Bits = 4 /* 16 byte alignment */;1620116202if (DAG.MaskedValueIsZero(Add->getOperand(1),16203APInt::getAllOnes(Bits /* alignment */)16204.zext(Add.getScalarValueSizeInBits()))) {16205SDNode *BasePtr = Add->getOperand(0).getNode();16206for (SDNode *U : BasePtr->uses()) {16207if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&16208U->getConstantOperandVal(0) == IID) {16209// We've found another LVSL/LVSR, and this address is an aligned16210// multiple of that one. The results will be the same, so use the16211// one we've just found instead.1621216213return SDValue(U, 0);16214}16215}16216}1621716218if (isa<ConstantSDNode>(Add->getOperand(1))) {16219SDNode *BasePtr = Add->getOperand(0).getNode();16220for (SDNode *U : BasePtr->uses()) {16221if (U->getOpcode() == ISD::ADD &&16222isa<ConstantSDNode>(U->getOperand(1)) &&16223(Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) %16224(1ULL << Bits) ==162250) {16226SDNode *OtherAdd = U;16227for (SDNode *V : OtherAdd->uses()) {16228if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&16229V->getConstantOperandVal(0) == IID) {16230return SDValue(V, 0);16231}16232}16233}16234}16235}16236}1623716238// Combine vmaxsw/h/b(a, a's negation) to abs(a)16239// Expose the vabsduw/h/b opportunity for down stream16240if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&16241(IID == Intrinsic::ppc_altivec_vmaxsw ||16242IID == Intrinsic::ppc_altivec_vmaxsh ||16243IID == Intrinsic::ppc_altivec_vmaxsb)) {16244SDValue V1 = N->getOperand(1);16245SDValue V2 = N->getOperand(2);16246if ((V1.getSimpleValueType() == MVT::v4i32 ||16247V1.getSimpleValueType() == MVT::v8i16 ||16248V1.getSimpleValueType() == MVT::v16i8) &&16249V1.getSimpleValueType() == V2.getSimpleValueType()) {16250// (0-a, a)16251if (V1.getOpcode() == ISD::SUB &&16252ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&16253V1.getOperand(1) == V2) {16254return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);16255}16256// (a, 0-a)16257if (V2.getOpcode() == ISD::SUB &&16258ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&16259V2.getOperand(1) == V1) {16260return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);16261}16262// (x-y, y-x)16263if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&16264V1.getOperand(0) == V2.getOperand(1) &&16265V1.getOperand(1) == V2.getOperand(0)) {16266return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);16267}16268}16269}16270}1627116272break;16273case ISD::INTRINSIC_W_CHAIN:16274switch (N->getConstantOperandVal(1)) {16275default:16276break;16277case Intrinsic::ppc_altivec_vsum4sbs:16278case Intrinsic::ppc_altivec_vsum4shs:16279case Intrinsic::ppc_altivec_vsum4ubs: {16280// These sum-across intrinsics only have a chain due to the side effect16281// that they may set the SAT bit. 
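      // For illustration: vsum4sbs/vsum4shs/vsum4ubs add the partial byte or
      // halfword sums of the first operand to the corresponding word of the
      // second operand with saturation. With an all-zero second operand the
      // per-word result is bounded by 4 * 127, 2 * 32767 or 4 * 255
      // respectively, which cannot saturate a 32-bit word, so the SAT bit
      // cannot be set; that is the zero-splat case recognized below.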
If we know the SAT bit will not be set16282// for some inputs, we can replace any uses of their chain with the16283// input chain.16284if (BuildVectorSDNode *BVN =16285dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {16286APInt APSplatBits, APSplatUndef;16287unsigned SplatBitSize;16288bool HasAnyUndefs;16289bool BVNIsConstantSplat = BVN->isConstantSplat(16290APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,16291!Subtarget.isLittleEndian());16292// If the constant splat vector is 0, the SAT bit will not be set.16293if (BVNIsConstantSplat && APSplatBits == 0)16294DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));16295}16296return SDValue();16297}16298case Intrinsic::ppc_vsx_lxvw4x:16299case Intrinsic::ppc_vsx_lxvd2x:16300// For little endian, VSX loads require generating lxvd2x/xxswapd.16301// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.16302if (Subtarget.needsSwapsForVSXMemOps())16303return expandVSXLoadForLE(N, DCI);16304break;16305}16306break;16307case ISD::INTRINSIC_VOID:16308// For little endian, VSX stores require generating xxswapd/stxvd2x.16309// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.16310if (Subtarget.needsSwapsForVSXMemOps()) {16311switch (N->getConstantOperandVal(1)) {16312default:16313break;16314case Intrinsic::ppc_vsx_stxvw4x:16315case Intrinsic::ppc_vsx_stxvd2x:16316return expandVSXStoreForLE(N, DCI);16317}16318}16319break;16320case ISD::BSWAP: {16321// Turn BSWAP (LOAD) -> lhbrx/lwbrx.16322// For subtargets without LDBRX, we can still do better than the default16323// expansion even for 64-bit BSWAP (LOAD).16324bool Is64BitBswapOn64BitTgt =16325Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;16326bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&16327N->getOperand(0).hasOneUse();16328if (IsSingleUseNormalLd &&16329(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||16330(Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {16331SDValue Load = N->getOperand(0);16332LoadSDNode *LD = cast<LoadSDNode>(Load);16333// Create the byte-swapping load.16334SDValue Ops[] = {16335LD->getChain(), // Chain16336LD->getBasePtr(), // Ptr16337DAG.getValueType(N->getValueType(0)) // VT16338};16339SDValue BSLoad =16340DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,16341DAG.getVTList(N->getValueType(0) == MVT::i64 ?16342MVT::i64 : MVT::i32, MVT::Other),16343Ops, LD->getMemoryVT(), LD->getMemOperand());1634416345// If this is an i16 load, insert the truncate.16346SDValue ResVal = BSLoad;16347if (N->getValueType(0) == MVT::i16)16348ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);1634916350// First, combine the bswap away. This makes the value produced by the16351// load dead.16352DCI.CombineTo(N, ResVal);1635316354// Next, combine the load away, we give it a bogus result value but a real16355// chain result. The result value is dead because the bswap is dead.16356DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));1635716358// Return N so it doesn't get rechecked!16359return SDValue(N, 0);16360}16361// Convert this to two 32-bit bswap loads and a BUILD_PAIR. 
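    // For illustration: a 64-bit byte swap is equivalent to swapping the two
    // 32-bit halves and byte-reversing each half, so two i32 loads at offsets
    // 0 and 4, each fed through a 32-bit BSWAP and rejoined with a BUILD_PAIR
    // in the endian-appropriate order, reproduce BSWAP of the original i64
    // load.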
Do this only16362// before legalization so that the BUILD_PAIR is handled correctly.16363if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||16364!IsSingleUseNormalLd)16365return SDValue();16366LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));1636716368// Can't split volatile or atomic loads.16369if (!LD->isSimple())16370return SDValue();16371SDValue BasePtr = LD->getBasePtr();16372SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,16373LD->getPointerInfo(), LD->getAlign());16374Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);16375BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,16376DAG.getIntPtrConstant(4, dl));16377MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(16378LD->getMemOperand(), 4, 4);16379SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr, NewMMO);16380Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);16381SDValue Res;16382if (Subtarget.isLittleEndian())16383Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);16384else16385Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);16386SDValue TF =16387DAG.getNode(ISD::TokenFactor, dl, MVT::Other,16388Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));16389DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);16390return Res;16391}16392case PPCISD::VCMP:16393// If a VCMP_rec node already exists with exactly the same operands as this16394// node, use its result instead of this node (VCMP_rec computes both a CR616395// and a normal output).16396//16397if (!N->getOperand(0).hasOneUse() &&16398!N->getOperand(1).hasOneUse() &&16399!N->getOperand(2).hasOneUse()) {1640016401// Scan all of the users of the LHS, looking for VCMP_rec's that match.16402SDNode *VCMPrecNode = nullptr;1640316404SDNode *LHSN = N->getOperand(0).getNode();16405for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();16406UI != E; ++UI)16407if (UI->getOpcode() == PPCISD::VCMP_rec &&16408UI->getOperand(1) == N->getOperand(1) &&16409UI->getOperand(2) == N->getOperand(2) &&16410UI->getOperand(0) == N->getOperand(0)) {16411VCMPrecNode = *UI;16412break;16413}1641416415// If there is no VCMP_rec node, or if the flag value has a single use,16416// don't transform this.16417if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(0, 1))16418break;1641916420// Look at the (necessarily single) use of the flag value. If it has a16421// chain, this transformation is more complex. Note that multiple things16422// could use the value result, which we should ignore.16423SDNode *FlagUser = nullptr;16424for (SDNode::use_iterator UI = VCMPrecNode->use_begin();16425FlagUser == nullptr; ++UI) {16426assert(UI != VCMPrecNode->use_end() && "Didn't find user!");16427SDNode *User = *UI;16428for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {16429if (User->getOperand(i) == SDValue(VCMPrecNode, 1)) {16430FlagUser = User;16431break;16432}16433}16434}1643516436// If the user is a MFOCRF instruction, we know this is safe.16437// Otherwise we give up for right now.16438if (FlagUser->getOpcode() == PPCISD::MFOCRF)16439return SDValue(VCMPrecNode, 0);16440}16441break;16442case ISD::BR_CC: {16443// If this is a branch on an altivec predicate comparison, lower this so16444// that we don't have to do a MFOCRF: instead, branch directly on CR6. 
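    // For illustration (a typical, not exhaustive, source of this pattern): a
    // source-level test such as vec_all_eq(a, b) usually reaches this point as
    // a BR_CC with SETEQ/SETNE comparing the i32 result of a dot-form compare
    // intrinsic (e.g. vcmpequw.) against the constant 0 or 1; the code below
    // turns that into a PPCISD::COND_BRANCH predicated on CR6.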
This16445// lowering is done pre-legalize, because the legalizer lowers the predicate16446// compare down to code that is difficult to reassemble.16447// This code also handles branches that depend on the result of a store16448// conditional.16449ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();16450SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);1645116452int CompareOpc;16453bool isDot;1645416455if (!isa<ConstantSDNode>(RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE))16456break;1645716458// Since we are doing this pre-legalize, the RHS can be a constant of16459// arbitrary bitwidth which may cause issues when trying to get the value16460// from the underlying APInt.16461auto RHSAPInt = RHS->getAsAPIntVal();16462if (!RHSAPInt.isIntN(64))16463break;1646416465unsigned Val = RHSAPInt.getZExtValue();16466auto isImpossibleCompare = [&]() {16467// If this is a comparison against something other than 0/1, then we know16468// that the condition is never/always true.16469if (Val != 0 && Val != 1) {16470if (CC == ISD::SETEQ) // Cond never true, remove branch.16471return N->getOperand(0);16472// Always !=, turn it into an unconditional branch.16473return DAG.getNode(ISD::BR, dl, MVT::Other,16474N->getOperand(0), N->getOperand(4));16475}16476return SDValue();16477};16478// Combine branches fed by store conditional instructions (st[bhwd]cx).16479unsigned StoreWidth = 0;16480if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&16481isStoreConditional(LHS, StoreWidth)) {16482if (SDValue Impossible = isImpossibleCompare())16483return Impossible;16484PPC::Predicate CompOpc;16485// eq 0 => ne16486// ne 0 => eq16487// eq 1 => eq16488// ne 1 => ne16489if (Val == 0)16490CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ;16491else16492CompOpc = CC == ISD::SETEQ ? 
PPC::PRED_EQ : PPC::PRED_NE;1649316494SDValue Ops[] = {LHS.getOperand(0), LHS.getOperand(2), LHS.getOperand(3),16495DAG.getConstant(StoreWidth, dl, MVT::i32)};16496auto *MemNode = cast<MemSDNode>(LHS);16497SDValue ConstSt = DAG.getMemIntrinsicNode(16498PPCISD::STORE_COND, dl,16499DAG.getVTList(MVT::i32, MVT::Other, MVT::Glue), Ops,16500MemNode->getMemoryVT(), MemNode->getMemOperand());1650116502SDValue InChain;16503// Unchain the branch from the original store conditional.16504if (N->getOperand(0) == LHS.getValue(1))16505InChain = LHS.getOperand(0);16506else if (N->getOperand(0).getOpcode() == ISD::TokenFactor) {16507SmallVector<SDValue, 4> InChains;16508SDValue InTF = N->getOperand(0);16509for (int i = 0, e = InTF.getNumOperands(); i < e; i++)16510if (InTF.getOperand(i) != LHS.getValue(1))16511InChains.push_back(InTF.getOperand(i));16512InChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, InChains);16513}1651416515return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, InChain,16516DAG.getConstant(CompOpc, dl, MVT::i32),16517DAG.getRegister(PPC::CR0, MVT::i32), N->getOperand(4),16518ConstSt.getValue(2));16519}1652016521if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&16522getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {16523assert(isDot && "Can't compare against a vector result!");1652416525if (SDValue Impossible = isImpossibleCompare())16526return Impossible;1652716528bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);16529// Create the PPCISD altivec 'dot' comparison node.16530SDValue Ops[] = {16531LHS.getOperand(2), // LHS of compare16532LHS.getOperand(3), // RHS of compare16533DAG.getConstant(CompareOpc, dl, MVT::i32)16534};16535EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };16536SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);1653716538// Unpack the result based on how the target uses it.16539PPC::Predicate CompOpc;16540switch (LHS.getConstantOperandVal(1)) {16541default: // Can't happen, don't crash on invalid number though.16542case 0: // Branch on the value of the EQ bit of CR6.16543CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;16544break;16545case 1: // Branch on the inverted value of the EQ bit of CR6.16546CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;16547break;16548case 2: // Branch on the value of the LT bit of CR6.16549CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;16550break;16551case 3: // Branch on the inverted value of the LT bit of CR6.16552CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;16553break;16554}1655516556return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),16557DAG.getConstant(CompOpc, dl, MVT::i32),16558DAG.getRegister(PPC::CR6, MVT::i32),16559N->getOperand(4), CompNode.getValue(1));16560}16561break;16562}16563case ISD::BUILD_VECTOR:16564return DAGCombineBuildVector(N, DCI);16565}1656616567return SDValue();16568}1656916570SDValue16571PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,16572SelectionDAG &DAG,16573SmallVectorImpl<SDNode *> &Created) const {16574// fold (sdiv X, pow2)16575EVT VT = N->getValueType(0);16576if (VT == MVT::i64 && !Subtarget.isPPC64())16577return SDValue();16578if ((VT != MVT::i32 && VT != MVT::i64) ||16579!(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))16580return SDValue();1658116582SDLoc DL(N);16583SDValue N0 = N->getOperand(0);1658416585bool IsNegPow2 = Divisor.isNegatedPowerOf2();16586unsigned Lg2 = (IsNegPow2 ? 
-Divisor : Divisor).countr_zero();16587SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);1658816589SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);16590Created.push_back(Op.getNode());1659116592if (IsNegPow2) {16593Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);16594Created.push_back(Op.getNode());16595}1659616597return Op;16598}1659916600//===----------------------------------------------------------------------===//16601// Inline Assembly Support16602//===----------------------------------------------------------------------===//1660316604void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,16605KnownBits &Known,16606const APInt &DemandedElts,16607const SelectionDAG &DAG,16608unsigned Depth) const {16609Known.resetAll();16610switch (Op.getOpcode()) {16611default: break;16612case PPCISD::LBRX: {16613// lhbrx is known to have the top bits cleared out.16614if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)16615Known.Zero = 0xFFFF0000;16616break;16617}16618case ISD::INTRINSIC_WO_CHAIN: {16619switch (Op.getConstantOperandVal(0)) {16620default: break;16621case Intrinsic::ppc_altivec_vcmpbfp_p:16622case Intrinsic::ppc_altivec_vcmpeqfp_p:16623case Intrinsic::ppc_altivec_vcmpequb_p:16624case Intrinsic::ppc_altivec_vcmpequh_p:16625case Intrinsic::ppc_altivec_vcmpequw_p:16626case Intrinsic::ppc_altivec_vcmpequd_p:16627case Intrinsic::ppc_altivec_vcmpequq_p:16628case Intrinsic::ppc_altivec_vcmpgefp_p:16629case Intrinsic::ppc_altivec_vcmpgtfp_p:16630case Intrinsic::ppc_altivec_vcmpgtsb_p:16631case Intrinsic::ppc_altivec_vcmpgtsh_p:16632case Intrinsic::ppc_altivec_vcmpgtsw_p:16633case Intrinsic::ppc_altivec_vcmpgtsd_p:16634case Intrinsic::ppc_altivec_vcmpgtsq_p:16635case Intrinsic::ppc_altivec_vcmpgtub_p:16636case Intrinsic::ppc_altivec_vcmpgtuh_p:16637case Intrinsic::ppc_altivec_vcmpgtuw_p:16638case Intrinsic::ppc_altivec_vcmpgtud_p:16639case Intrinsic::ppc_altivec_vcmpgtuq_p:16640Known.Zero = ~1U; // All bits but the low one are known to be zero.16641break;16642}16643break;16644}16645case ISD::INTRINSIC_W_CHAIN: {16646switch (Op.getConstantOperandVal(1)) {16647default:16648break;16649case Intrinsic::ppc_load2r:16650// Top bits are cleared for load2r (which is the same as lhbrx).16651Known.Zero = 0xFFFF0000;16652break;16653}16654break;16655}16656}16657}1665816659Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {16660switch (Subtarget.getCPUDirective()) {16661default: break;16662case PPC::DIR_970:16663case PPC::DIR_PWR4:16664case PPC::DIR_PWR5:16665case PPC::DIR_PWR5X:16666case PPC::DIR_PWR6:16667case PPC::DIR_PWR6X:16668case PPC::DIR_PWR7:16669case PPC::DIR_PWR8:16670case PPC::DIR_PWR9:16671case PPC::DIR_PWR10:16672case PPC::DIR_PWR11:16673case PPC::DIR_PWR_FUTURE: {16674if (!ML)16675break;1667616677if (!DisableInnermostLoopAlign32) {16678// If the nested loop is an innermost loop, prefer to a 32-byte alignment,16679// so that we can decrease cache misses and branch-prediction misses.16680// Actual alignment of the loop will depend on the hotness check and other16681// logic in alignBlocks.16682if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())16683return Align(32);16684}1668516686const PPCInstrInfo *TII = Subtarget.getInstrInfo();1668716688// For small loops (between 5 and 8 instructions), align to a 32-byte16689// boundary so that the entire loop fits in one instruction-cache line.16690uint64_t LoopSize = 0;16691for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)16692for (const MachineInstr &J : 
**I) {16693LoopSize += TII->getInstSizeInBytes(J);16694if (LoopSize > 32)16695break;16696}1669716698if (LoopSize > 16 && LoopSize <= 32)16699return Align(32);1670016701break;16702}16703}1670416705return TargetLowering::getPrefLoopAlignment(ML);16706}1670716708/// getConstraintType - Given a constraint, return the type of16709/// constraint it is for this target.16710PPCTargetLowering::ConstraintType16711PPCTargetLowering::getConstraintType(StringRef Constraint) const {16712if (Constraint.size() == 1) {16713switch (Constraint[0]) {16714default: break;16715case 'b':16716case 'r':16717case 'f':16718case 'd':16719case 'v':16720case 'y':16721return C_RegisterClass;16722case 'Z':16723// FIXME: While Z does indicate a memory constraint, it specifically16724// indicates an r+r address (used in conjunction with the 'y' modifier16725// in the replacement string). Currently, we're forcing the base16726// register to be r0 in the asm printer (which is interpreted as zero)16727// and forming the complete address in the second register. This is16728// suboptimal.16729return C_Memory;16730}16731} else if (Constraint == "wc") { // individual CR bits.16732return C_RegisterClass;16733} else if (Constraint == "wa" || Constraint == "wd" ||16734Constraint == "wf" || Constraint == "ws" ||16735Constraint == "wi" || Constraint == "ww") {16736return C_RegisterClass; // VSX registers.16737}16738return TargetLowering::getConstraintType(Constraint);16739}1674016741/// Examine constraint type and operand type and determine a weight value.16742/// This object must already have been set up with the operand type16743/// and the current alternative constraint selected.16744TargetLowering::ConstraintWeight16745PPCTargetLowering::getSingleConstraintMatchWeight(16746AsmOperandInfo &info, const char *constraint) const {16747ConstraintWeight weight = CW_Invalid;16748Value *CallOperandVal = info.CallOperandVal;16749// If we don't have a value, we can't do a match,16750// but allow it at the lowest weight.16751if (!CallOperandVal)16752return CW_Default;16753Type *type = CallOperandVal->getType();1675416755// Look at the constraint type.16756if (StringRef(constraint) == "wc" && type->isIntegerTy(1))16757return CW_Register; // an individual CR bit.16758else if ((StringRef(constraint) == "wa" ||16759StringRef(constraint) == "wd" ||16760StringRef(constraint) == "wf") &&16761type->isVectorTy())16762return CW_Register;16763else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))16764return CW_Register; // just hold 64-bit integers data.16765else if (StringRef(constraint) == "ws" && type->isDoubleTy())16766return CW_Register;16767else if (StringRef(constraint) == "ww" && type->isFloatTy())16768return CW_Register;1676916770switch (*constraint) {16771default:16772weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);16773break;16774case 'b':16775if (type->isIntegerTy())16776weight = CW_Register;16777break;16778case 'f':16779if (type->isFloatTy())16780weight = CW_Register;16781break;16782case 'd':16783if (type->isDoubleTy())16784weight = CW_Register;16785break;16786case 'v':16787if (type->isVectorTy())16788weight = CW_Register;16789break;16790case 'y':16791weight = CW_Register;16792break;16793case 'Z':16794weight = CW_Memory;16795break;16796}16797return weight;16798}1679916800std::pair<unsigned, const TargetRegisterClass *>16801PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,16802StringRef Constraint,16803MVT VT) const {16804if (Constraint.size() == 1) {16805// GCC RS6000 
Constraint Letters16806switch (Constraint[0]) {16807case 'b': // R1-R3116808if (VT == MVT::i64 && Subtarget.isPPC64())16809return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);16810return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);16811case 'r': // R0-R3116812if (VT == MVT::i64 && Subtarget.isPPC64())16813return std::make_pair(0U, &PPC::G8RCRegClass);16814return std::make_pair(0U, &PPC::GPRCRegClass);16815// 'd' and 'f' constraints are both defined to be "the floating point16816// registers", where one is for 32-bit and the other for 64-bit. We don't16817// really care overly much here so just give them all the same reg classes.16818case 'd':16819case 'f':16820if (Subtarget.hasSPE()) {16821if (VT == MVT::f32 || VT == MVT::i32)16822return std::make_pair(0U, &PPC::GPRCRegClass);16823if (VT == MVT::f64 || VT == MVT::i64)16824return std::make_pair(0U, &PPC::SPERCRegClass);16825} else {16826if (VT == MVT::f32 || VT == MVT::i32)16827return std::make_pair(0U, &PPC::F4RCRegClass);16828if (VT == MVT::f64 || VT == MVT::i64)16829return std::make_pair(0U, &PPC::F8RCRegClass);16830}16831break;16832case 'v':16833if (Subtarget.hasAltivec() && VT.isVector())16834return std::make_pair(0U, &PPC::VRRCRegClass);16835else if (Subtarget.hasVSX())16836// Scalars in Altivec registers only make sense with VSX.16837return std::make_pair(0U, &PPC::VFRCRegClass);16838break;16839case 'y': // crrc16840return std::make_pair(0U, &PPC::CRRCRegClass);16841}16842} else if (Constraint == "wc" && Subtarget.useCRBits()) {16843// An individual CR bit.16844return std::make_pair(0U, &PPC::CRBITRCRegClass);16845} else if ((Constraint == "wa" || Constraint == "wd" ||16846Constraint == "wf" || Constraint == "wi") &&16847Subtarget.hasVSX()) {16848// A VSX register for either a scalar (FP) or vector. There is no16849// support for single precision scalars on subtargets prior to Power8.16850if (VT.isVector())16851return std::make_pair(0U, &PPC::VSRCRegClass);16852if (VT == MVT::f32 && Subtarget.hasP8Vector())16853return std::make_pair(0U, &PPC::VSSRCRegClass);16854return std::make_pair(0U, &PPC::VSFRCRegClass);16855} else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {16856if (VT == MVT::f32 && Subtarget.hasP8Vector())16857return std::make_pair(0U, &PPC::VSSRCRegClass);16858else16859return std::make_pair(0U, &PPC::VSFRCRegClass);16860} else if (Constraint == "lr") {16861if (VT == MVT::i64)16862return std::make_pair(0U, &PPC::LR8RCRegClass);16863else16864return std::make_pair(0U, &PPC::LRRCRegClass);16865}1686616867// Handle special cases of physical registers that are not properly handled16868// by the base class.16869if (Constraint[0] == '{' && Constraint[Constraint.size() - 1] == '}') {16870// If we name a VSX register, we can't defer to the base class because it16871// will not recognize the correct register (their names will be VSL{0-31}16872// and V{0-31} so they won't match). 
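  // For example, under the mapping below the constraint "{vs3}" resolves to
  // VSL3, while "{vs35}" resolves to the overlapping Altivec register V3.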
So we match them here.16873if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {16874int VSNum = atoi(Constraint.data() + 3);16875assert(VSNum >= 0 && VSNum <= 63 &&16876"Attempted to access a vsr out of range");16877if (VSNum < 32)16878return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);16879return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);16880}1688116882// For float registers, we can't defer to the base class as it will match16883// the SPILLTOVSRRC class.16884if (Constraint.size() > 3 && Constraint[1] == 'f') {16885int RegNum = atoi(Constraint.data() + 2);16886if (RegNum > 31 || RegNum < 0)16887report_fatal_error("Invalid floating point register number");16888if (VT == MVT::f32 || VT == MVT::i32)16889return Subtarget.hasSPE()16890? std::make_pair(PPC::R0 + RegNum, &PPC::GPRCRegClass)16891: std::make_pair(PPC::F0 + RegNum, &PPC::F4RCRegClass);16892if (VT == MVT::f64 || VT == MVT::i64)16893return Subtarget.hasSPE()16894? std::make_pair(PPC::S0 + RegNum, &PPC::SPERCRegClass)16895: std::make_pair(PPC::F0 + RegNum, &PPC::F8RCRegClass);16896}16897}1689816899std::pair<unsigned, const TargetRegisterClass *> R =16900TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);1690116902// r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers16903// (which we call X[0-9]+). If a 64-bit value has been requested, and a16904// 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent16905// register.16906// FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use16907// the AsmName field from *RegisterInfo.td, then this would not be necessary.16908if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&16909PPC::GPRCRegClass.contains(R.first))16910return std::make_pair(TRI->getMatchingSuperReg(R.first,16911PPC::sub_32, &PPC::G8RCRegClass),16912&PPC::G8RCRegClass);1691316914// GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.16915if (!R.second && StringRef("{cc}").equals_insensitive(Constraint)) {16916R.first = PPC::CR0;16917R.second = &PPC::CRRCRegClass;16918}16919// FIXME: This warning should ideally be emitted in the front end.16920const auto &TM = getTargetMachine();16921if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {16922if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||16923(R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&16924(R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))16925errs() << "warning: vector registers 20 to 32 are reserved in the "16926"default AIX AltiVec ABI and cannot be used\n";16927}1692816929return R;16930}1693116932/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops16933/// vector. 
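/// For example, among the PPC immediate constraints handled below, "I" only
/// matches a signed 16-bit value such as -42, "K" an unsigned 16-bit value
/// such as 0xFFFF, and "L" a value such as 0x30000 (a signed 16-bit constant
/// shifted left 16 bits).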
If it is invalid, don't add anything to Ops.16934void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,16935StringRef Constraint,16936std::vector<SDValue> &Ops,16937SelectionDAG &DAG) const {16938SDValue Result;1693916940// Only support length 1 constraints.16941if (Constraint.size() > 1)16942return;1694316944char Letter = Constraint[0];16945switch (Letter) {16946default: break;16947case 'I':16948case 'J':16949case 'K':16950case 'L':16951case 'M':16952case 'N':16953case 'O':16954case 'P': {16955ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);16956if (!CST) return; // Must be an immediate to match.16957SDLoc dl(Op);16958int64_t Value = CST->getSExtValue();16959EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative16960// numbers are printed as such.16961switch (Letter) {16962default: llvm_unreachable("Unknown constraint letter!");16963case 'I': // "I" is a signed 16-bit constant.16964if (isInt<16>(Value))16965Result = DAG.getTargetConstant(Value, dl, TCVT);16966break;16967case 'J': // "J" is a constant with only the high-order 16 bits nonzero.16968if (isShiftedUInt<16, 16>(Value))16969Result = DAG.getTargetConstant(Value, dl, TCVT);16970break;16971case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.16972if (isShiftedInt<16, 16>(Value))16973Result = DAG.getTargetConstant(Value, dl, TCVT);16974break;16975case 'K': // "K" is a constant with only the low-order 16 bits nonzero.16976if (isUInt<16>(Value))16977Result = DAG.getTargetConstant(Value, dl, TCVT);16978break;16979case 'M': // "M" is a constant that is greater than 31.16980if (Value > 31)16981Result = DAG.getTargetConstant(Value, dl, TCVT);16982break;16983case 'N': // "N" is a positive constant that is an exact power of two.16984if (Value > 0 && isPowerOf2_64(Value))16985Result = DAG.getTargetConstant(Value, dl, TCVT);16986break;16987case 'O': // "O" is the constant zero.16988if (Value == 0)16989Result = DAG.getTargetConstant(Value, dl, TCVT);16990break;16991case 'P': // "P" is a constant whose negation is a signed 16-bit constant.16992if (isInt<16>(-Value))16993Result = DAG.getTargetConstant(Value, dl, TCVT);16994break;16995}16996break;16997}16998}1699917000if (Result.getNode()) {17001Ops.push_back(Result);17002return;17003}1700417005// Handle standard constraint letters.17006TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);17007}1700817009void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,17010SmallVectorImpl<SDValue> &Ops,17011SelectionDAG &DAG) const {17012if (I.getNumOperands() <= 1)17013return;17014if (!isa<ConstantSDNode>(Ops[1].getNode()))17015return;17016auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();17017if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&17018IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)17019return;1702017021if (MDNode *MDN = I.getMetadata(LLVMContext::MD_annotation))17022Ops.push_back(DAG.getMDNode(MDN));17023}1702417025// isLegalAddressingMode - Return true if the addressing mode represented17026// by AM is legal for this target, for a load/store of the specified type.17027bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,17028const AddrMode &AM, Type *Ty,17029unsigned AS,17030Instruction *I) const {17031// Vector type r+i form is supported since power9 as DQ form. 
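  // For example, a v4i32 access at "base + 32" (offset a multiple of 16) can
  // be selected as a DQ-form access on Power9, whereas on subtargets without
  // Power9 vectors any vector access with a nonzero immediate offset is
  // rejected below and is typically formed as an r+r address instead.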
We don't check17032// the offset matching DQ form requirement(off % 16 == 0), because on PowerPC,17033// imm form is preferred and the offset can be adjusted to use imm form later17034// in pass PPCLoopInstrFormPrep. Also in LSR, for one LSRUse, it uses min and17035// max offset to check legal addressing mode, we should be a little aggressive17036// to contain other offsets for that LSRUse.17037if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())17038return false;1703917040// PPC allows a sign-extended 16-bit immediate field.17041if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)17042return false;1704317044// No global is ever allowed as a base.17045if (AM.BaseGV)17046return false;1704717048// PPC only support r+r,17049switch (AM.Scale) {17050case 0: // "r+i" or just "i", depending on HasBaseReg.17051break;17052case 1:17053if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.17054return false;17055// Otherwise we have r+r or r+i.17056break;17057case 2:17058if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.17059return false;17060// Allow 2*r as r+r.17061break;17062default:17063// No other scales are supported.17064return false;17065}1706617067return true;17068}1706917070SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,17071SelectionDAG &DAG) const {17072MachineFunction &MF = DAG.getMachineFunction();17073MachineFrameInfo &MFI = MF.getFrameInfo();17074MFI.setReturnAddressIsTaken(true);1707517076if (verifyReturnAddressArgumentIsConstant(Op, DAG))17077return SDValue();1707817079SDLoc dl(Op);17080unsigned Depth = Op.getConstantOperandVal(0);1708117082// Make sure the function does not optimize away the store of the RA to17083// the stack.17084PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();17085FuncInfo->setLRStoreRequired();17086bool isPPC64 = Subtarget.isPPC64();17087auto PtrVT = getPointerTy(MF.getDataLayout());1708817089if (Depth > 0) {17090// The link register (return address) is saved in the caller's frame17091// not the callee's stack frame. So we must get the caller's frame17092// address and load the return address at the LR offset from there.17093SDValue FrameAddr =17094DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),17095LowerFRAMEADDR(Op, DAG), MachinePointerInfo());17096SDValue Offset =17097DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,17098isPPC64 ? MVT::i64 : MVT::i32);17099return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),17100DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),17101MachinePointerInfo());17102}1710317104// Just load the return address off the stack.17105SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);17106return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,17107MachinePointerInfo());17108}1710917110SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,17111SelectionDAG &DAG) const {17112SDLoc dl(Op);17113unsigned Depth = Op.getConstantOperandVal(0);1711417115MachineFunction &MF = DAG.getMachineFunction();17116MachineFrameInfo &MFI = MF.getFrameInfo();17117MFI.setFrameAddressIsTaken(true);1711817119EVT PtrVT = getPointerTy(MF.getDataLayout());17120bool isPPC64 = PtrVT == MVT::i64;1712117122// Naked functions never have a frame pointer, and so we use r1. For all17123// other functions, this decision must be delayed until during PEI.17124unsigned FrameReg;17125if (MF.getFunction().hasFnAttribute(Attribute::Naked))17126FrameReg = isPPC64 ? PPC::X1 : PPC::R1;17127else17128FrameReg = isPPC64 ? 
PPC::FP8 : PPC::FP;1712917130SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,17131PtrVT);17132while (Depth--)17133FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),17134FrameAddr, MachinePointerInfo());17135return FrameAddr;17136}1713717138// FIXME? Maybe this could be a TableGen attribute on some registers and17139// this table could be generated automatically from RegInfo.17140Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,17141const MachineFunction &MF) const {17142bool isPPC64 = Subtarget.isPPC64();1714317144bool is64Bit = isPPC64 && VT == LLT::scalar(64);17145if (!is64Bit && VT != LLT::scalar(32))17146report_fatal_error("Invalid register global variable type");1714717148Register Reg = StringSwitch<Register>(RegName)17149.Case("r1", is64Bit ? PPC::X1 : PPC::R1)17150.Case("r2", isPPC64 ? Register() : PPC::R2)17151.Case("r13", (is64Bit ? PPC::X13 : PPC::R13))17152.Default(Register());1715317154if (Reg)17155return Reg;17156report_fatal_error("Invalid register name global variable");17157}1715817159bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {17160// 32-bit SVR4 ABI access everything as got-indirect.17161if (Subtarget.is32BitELFABI())17162return true;1716317164// AIX accesses everything indirectly through the TOC, which is similar to17165// the GOT.17166if (Subtarget.isAIXABI())17167return true;1716817169CodeModel::Model CModel = getTargetMachine().getCodeModel();17170// If it is small or large code model, module locals are accessed17171// indirectly by loading their address from .toc/.got.17172if (CModel == CodeModel::Small || CModel == CodeModel::Large)17173return true;1717417175// JumpTable and BlockAddress are accessed as got-indirect.17176if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))17177return true;1717817179if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))17180return Subtarget.isGVIndirectSymbol(G->getGlobal());1718117182return false;17183}1718417185bool17186PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {17187// The PowerPC target isn't yet aware of offsets.17188return false;17189}1719017191bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,17192const CallInst &I,17193MachineFunction &MF,17194unsigned Intrinsic) const {17195switch (Intrinsic) {17196case Intrinsic::ppc_atomicrmw_xchg_i128:17197case Intrinsic::ppc_atomicrmw_add_i128:17198case Intrinsic::ppc_atomicrmw_sub_i128:17199case Intrinsic::ppc_atomicrmw_nand_i128:17200case Intrinsic::ppc_atomicrmw_and_i128:17201case Intrinsic::ppc_atomicrmw_or_i128:17202case Intrinsic::ppc_atomicrmw_xor_i128:17203case Intrinsic::ppc_cmpxchg_i128:17204Info.opc = ISD::INTRINSIC_W_CHAIN;17205Info.memVT = MVT::i128;17206Info.ptrVal = I.getArgOperand(0);17207Info.offset = 0;17208Info.align = Align(16);17209Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |17210MachineMemOperand::MOVolatile;17211return true;17212case Intrinsic::ppc_atomic_load_i128:17213Info.opc = ISD::INTRINSIC_W_CHAIN;17214Info.memVT = MVT::i128;17215Info.ptrVal = I.getArgOperand(0);17216Info.offset = 0;17217Info.align = Align(16);17218Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;17219return true;17220case Intrinsic::ppc_atomic_store_i128:17221Info.opc = ISD::INTRINSIC_VOID;17222Info.memVT = MVT::i128;17223Info.ptrVal = I.getArgOperand(2);17224Info.offset = 0;17225Info.align = Align(16);17226Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;17227return 
true;17228case Intrinsic::ppc_altivec_lvx:17229case Intrinsic::ppc_altivec_lvxl:17230case Intrinsic::ppc_altivec_lvebx:17231case Intrinsic::ppc_altivec_lvehx:17232case Intrinsic::ppc_altivec_lvewx:17233case Intrinsic::ppc_vsx_lxvd2x:17234case Intrinsic::ppc_vsx_lxvw4x:17235case Intrinsic::ppc_vsx_lxvd2x_be:17236case Intrinsic::ppc_vsx_lxvw4x_be:17237case Intrinsic::ppc_vsx_lxvl:17238case Intrinsic::ppc_vsx_lxvll: {17239EVT VT;17240switch (Intrinsic) {17241case Intrinsic::ppc_altivec_lvebx:17242VT = MVT::i8;17243break;17244case Intrinsic::ppc_altivec_lvehx:17245VT = MVT::i16;17246break;17247case Intrinsic::ppc_altivec_lvewx:17248VT = MVT::i32;17249break;17250case Intrinsic::ppc_vsx_lxvd2x:17251case Intrinsic::ppc_vsx_lxvd2x_be:17252VT = MVT::v2f64;17253break;17254default:17255VT = MVT::v4i32;17256break;17257}1725817259Info.opc = ISD::INTRINSIC_W_CHAIN;17260Info.memVT = VT;17261Info.ptrVal = I.getArgOperand(0);17262Info.offset = -VT.getStoreSize()+1;17263Info.size = 2*VT.getStoreSize()-1;17264Info.align = Align(1);17265Info.flags = MachineMemOperand::MOLoad;17266return true;17267}17268case Intrinsic::ppc_altivec_stvx:17269case Intrinsic::ppc_altivec_stvxl:17270case Intrinsic::ppc_altivec_stvebx:17271case Intrinsic::ppc_altivec_stvehx:17272case Intrinsic::ppc_altivec_stvewx:17273case Intrinsic::ppc_vsx_stxvd2x:17274case Intrinsic::ppc_vsx_stxvw4x:17275case Intrinsic::ppc_vsx_stxvd2x_be:17276case Intrinsic::ppc_vsx_stxvw4x_be:17277case Intrinsic::ppc_vsx_stxvl:17278case Intrinsic::ppc_vsx_stxvll: {17279EVT VT;17280switch (Intrinsic) {17281case Intrinsic::ppc_altivec_stvebx:17282VT = MVT::i8;17283break;17284case Intrinsic::ppc_altivec_stvehx:17285VT = MVT::i16;17286break;17287case Intrinsic::ppc_altivec_stvewx:17288VT = MVT::i32;17289break;17290case Intrinsic::ppc_vsx_stxvd2x:17291case Intrinsic::ppc_vsx_stxvd2x_be:17292VT = MVT::v2f64;17293break;17294default:17295VT = MVT::v4i32;17296break;17297}1729817299Info.opc = ISD::INTRINSIC_VOID;17300Info.memVT = VT;17301Info.ptrVal = I.getArgOperand(1);17302Info.offset = -VT.getStoreSize()+1;17303Info.size = 2*VT.getStoreSize()-1;17304Info.align = Align(1);17305Info.flags = MachineMemOperand::MOStore;17306return true;17307}17308case Intrinsic::ppc_stdcx:17309case Intrinsic::ppc_stwcx:17310case Intrinsic::ppc_sthcx:17311case Intrinsic::ppc_stbcx: {17312EVT VT;17313auto Alignment = Align(8);17314switch (Intrinsic) {17315case Intrinsic::ppc_stdcx:17316VT = MVT::i64;17317break;17318case Intrinsic::ppc_stwcx:17319VT = MVT::i32;17320Alignment = Align(4);17321break;17322case Intrinsic::ppc_sthcx:17323VT = MVT::i16;17324Alignment = Align(2);17325break;17326case Intrinsic::ppc_stbcx:17327VT = MVT::i8;17328Alignment = Align(1);17329break;17330}17331Info.opc = ISD::INTRINSIC_W_CHAIN;17332Info.memVT = VT;17333Info.ptrVal = I.getArgOperand(0);17334Info.offset = 0;17335Info.align = Alignment;17336Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;17337return true;17338}17339default:17340break;17341}1734217343return false;17344}1734517346/// It returns EVT::Other if the type should be determined using generic17347/// target-independent logic.17348EVT PPCTargetLowering::getOptimalMemOpType(17349const MemOp &Op, const AttributeList &FuncAttributes) const {17350if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) {17351// We should use Altivec/VSX loads and stores when available. 
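    // For example, a 32-byte memset on a VSX subtarget is expected to be
    // lowered as two 16-byte vector stores rather than four 64-bit integer
    // stores.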
For unaligned17352// addresses, unaligned VSX loads are only fast starting with the P8.17353if (Subtarget.hasAltivec() && Op.size() >= 16) {17354if (Op.isMemset() && Subtarget.hasVSX()) {17355uint64_t TailSize = Op.size() % 16;17356// For memset lowering, EXTRACT_VECTOR_ELT tries to return constant17357// element if vector element type matches tail store. For tail size17358// 3/4, the tail store is i32, v4i32 cannot be used, need a legal one.17359if (TailSize > 2 && TailSize <= 4) {17360return MVT::v8i16;17361}17362return MVT::v4i32;17363}17364if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())17365return MVT::v4i32;17366}17367}1736817369if (Subtarget.isPPC64()) {17370return MVT::i64;17371}1737217373return MVT::i32;17374}1737517376/// Returns true if it is beneficial to convert a load of a constant17377/// to just the constant itself.17378bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,17379Type *Ty) const {17380assert(Ty->isIntegerTy());1738117382unsigned BitSize = Ty->getPrimitiveSizeInBits();17383return !(BitSize == 0 || BitSize > 64);17384}1738517386bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {17387if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())17388return false;17389unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();17390unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();17391return NumBits1 == 64 && NumBits2 == 32;17392}1739317394bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {17395if (!VT1.isInteger() || !VT2.isInteger())17396return false;17397unsigned NumBits1 = VT1.getSizeInBits();17398unsigned NumBits2 = VT2.getSizeInBits();17399return NumBits1 == 64 && NumBits2 == 32;17400}1740117402bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {17403// Generally speaking, zexts are not free, but they are free when they can be17404// folded with other operations.17405if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {17406EVT MemVT = LD->getMemoryVT();17407if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||17408(Subtarget.isPPC64() && MemVT == MVT::i32)) &&17409(LD->getExtensionType() == ISD::NON_EXTLOAD ||17410LD->getExtensionType() == ISD::ZEXTLOAD))17411return true;17412}1741317414// FIXME: Add other cases...17415// - 32-bit shifts with a zext to i6417416// - zext after ctlz, bswap, etc.17417// - zext after and by a constant mask1741817419return TargetLowering::isZExtFree(Val, VT2);17420}1742117422bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {17423assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&17424"invalid fpext types");17425// Extending to float128 is not free.17426if (DestVT == MVT::f128)17427return false;17428return true;17429}1743017431bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {17432return isInt<16>(Imm) || isUInt<16>(Imm);17433}1743417435bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {17436return isInt<16>(Imm) || isUInt<16>(Imm);17437}1743817439bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,17440MachineMemOperand::Flags,17441unsigned *Fast) const {17442if (DisablePPCUnaligned)17443return false;1744417445// PowerPC supports unaligned memory access for simple non-vector types.17446// Although accessing unaligned addresses is not as efficient as accessing17447// aligned addresses, it is generally more efficient than manual expansion,17448// and generally only traps for software emulation when crossing page17449// boundaries.1745017451if (!VT.isSimple())17452return false;1745317454if 
(VT.isFloatingPoint() && !VT.isVector() &&17455!Subtarget.allowsUnalignedFPAccess())17456return false;1745717458if (VT.getSimpleVT().isVector()) {17459if (Subtarget.hasVSX()) {17460if (VT != MVT::v2f64 && VT != MVT::v2i64 &&17461VT != MVT::v4f32 && VT != MVT::v4i32)17462return false;17463} else {17464return false;17465}17466}1746717468if (VT == MVT::ppcf128)17469return false;1747017471if (Fast)17472*Fast = 1;1747317474return true;17475}1747617477bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,17478SDValue C) const {17479// Check integral scalar types.17480if (!VT.isScalarInteger())17481return false;17482if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {17483if (!ConstNode->getAPIntValue().isSignedIntN(64))17484return false;17485// This transformation will generate >= 2 operations. But the following17486// cases will generate <= 2 instructions during ISEL. So exclude them.17487// 1. If the constant multiplier fits 16 bits, it can be handled by one17488// HW instruction, ie. MULLI17489// 2. If the multiplier after shifted fits 16 bits, an extra shift17490// instruction is needed than case 1, ie. MULLI and RLDICR17491int64_t Imm = ConstNode->getSExtValue();17492unsigned Shift = llvm::countr_zero<uint64_t>(Imm);17493Imm >>= Shift;17494if (isInt<16>(Imm))17495return false;17496uint64_t UImm = static_cast<uint64_t>(Imm);17497if (isPowerOf2_64(UImm + 1) || isPowerOf2_64(UImm - 1) ||17498isPowerOf2_64(1 - UImm) || isPowerOf2_64(-1 - UImm))17499return true;17500}17501return false;17502}1750317504bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,17505EVT VT) const {17506return isFMAFasterThanFMulAndFAdd(17507MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));17508}1750917510bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,17511Type *Ty) const {17512if (Subtarget.hasSPE() || Subtarget.useSoftFloat())17513return false;17514switch (Ty->getScalarType()->getTypeID()) {17515case Type::FloatTyID:17516case Type::DoubleTyID:17517return true;17518case Type::FP128TyID:17519return Subtarget.hasP9Vector();17520default:17521return false;17522}17523}1752417525// FIXME: add more patterns which are not profitable to hoist.17526bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {17527if (!I->hasOneUse())17528return true;1752917530Instruction *User = I->user_back();17531assert(User && "A single use instruction with no uses.");1753217533switch (I->getOpcode()) {17534case Instruction::FMul: {17535// Don't break FMA, PowerPC prefers FMA.17536if (User->getOpcode() != Instruction::FSub &&17537User->getOpcode() != Instruction::FAdd)17538return true;1753917540const TargetOptions &Options = getTargetMachine().Options;17541const Function *F = I->getFunction();17542const DataLayout &DL = F->getDataLayout();17543Type *Ty = User->getOperand(0)->getType();1754417545return !(17546isFMAFasterThanFMulAndFAdd(*F, Ty) &&17547isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&17548(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));17549}17550case Instruction::Load: {17551// Don't break "store (load float*)" pattern, this pattern will be combined17552// to "store (load int32)" in later InstCombine pass. See function17553// combineLoadToOperationType. 
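    // A typical instance of the guarded pattern, in IR, is:
    //   %f = load float, ptr %p
    //   store float %f, ptr %q
    // Hoisting the load away from the store would block that combine.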
    // On PowerPC, loading a floating-point value takes more
    // cycles than loading a 32-bit integer.
    LoadInst *LI = cast<LoadInst>(I);
    // For the loads that combineLoadToOperationType does nothing, like
    // ordered load, it should be profitable to hoist them.
    // For swifterror load, it can only be used for pointer to pointer type, so
    // later type check should get rid of this case.
    if (!LI->isUnordered())
      return true;

    if (User->getOpcode() != Instruction::Store)
      return true;

    if (I->getType()->getTypeID() != Type::FloatTyID)
      return true;

    return false;
  }
  default:
    return true;
  }
  return true;
}

const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any
  // call site. Hence we include LR in the scratch registers, which are in
  // turn added as implicit-defs for stackmaps and patchpoints. The same
  // reasoning applies to CTR, which is used by any indirect call.
  static const MCPhysReg ScratchRegs[] = {
    PPC::X12, PPC::LR8, PPC::CTR8, 0
  };

  return ScratchRegs;
}

Register PPCTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}

Register PPCTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}

bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
    EVT VT, unsigned DefinedValues) const {
  if (VT == MVT::v2i64)
    return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves

  if (Subtarget.hasVSX())
    return true;

  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}

Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
  if (DisableILPPref || Subtarget.enableMachineScheduler())
    return TargetLowering::getSchedulingPreference(N);

  return Sched::ILP;
}

// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
                                  const TargetLibraryInfo *LibInfo) const {
  return PPC::createFastISel(FuncInfo, LibInfo);
}

// 'Inverted' means the FMA opcode after negating one multiplicand.
// For example, (fma -a b c) = (fnmsub a b c)
static unsigned invertFMAOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Invalid FMA opcode for PowerPC!");
  case ISD::FMA:
    return PPCISD::FNMSUB;
  case PPCISD::FNMSUB:
    return ISD::FMA;
  }
}

SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                                bool LegalOps, bool OptForSize,
                                                NegatibleCost &Cost,
                                                unsigned Depth) const {
  if (Depth > SelectionDAG::MaxRecursionDepth)
    return SDValue();

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op.getNode()->getFlags();

  switch (Opc) {
  case PPCISD::FNMSUB:
    if (!Op.hasOneUse() || !isTypeLegal(VT))
      break;

    const TargetOptions &Options = getTargetMachine().Options;
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    SDValue N2 = Op.getOperand(2);
    SDLoc Loc(Op);

    NegatibleCost N2Cost = NegatibleCost::Expensive;
    SDValue NegN2 =
        getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth +
1);1766417665if (!NegN2)17666return SDValue();1766717668// (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))17669// (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))17670// These transformations may change sign of zeroes. For example,17671// -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.17672if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {17673// Try and choose the cheaper one to negate.17674NegatibleCost N0Cost = NegatibleCost::Expensive;17675SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,17676N0Cost, Depth + 1);1767717678NegatibleCost N1Cost = NegatibleCost::Expensive;17679SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,17680N1Cost, Depth + 1);1768117682if (NegN0 && N0Cost <= N1Cost) {17683Cost = std::min(N0Cost, N2Cost);17684return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);17685} else if (NegN1) {17686Cost = std::min(N1Cost, N2Cost);17687return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);17688}17689}1769017691// (fneg (fnmsub a b c)) => (fma a b (fneg c))17692if (isOperationLegal(ISD::FMA, VT)) {17693Cost = N2Cost;17694return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);17695}1769617697break;17698}1769917700return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,17701Cost, Depth);17702}1770317704// Override to enable LOAD_STACK_GUARD lowering on Linux.17705bool PPCTargetLowering::useLoadStackGuardNode() const {17706if (!Subtarget.isTargetLinux())17707return TargetLowering::useLoadStackGuardNode();17708return true;17709}1771017711// Override to disable global variable loading on Linux and insert AIX canary17712// word declaration.17713void PPCTargetLowering::insertSSPDeclarations(Module &M) const {17714if (Subtarget.isAIXABI()) {17715M.getOrInsertGlobal(AIXSSPCanaryWordName,17716PointerType::getUnqual(M.getContext()));17717return;17718}17719if (!Subtarget.isTargetLinux())17720return TargetLowering::insertSSPDeclarations(M);17721}1772217723Value *PPCTargetLowering::getSDagStackGuard(const Module &M) const {17724if (Subtarget.isAIXABI())17725return M.getGlobalVariable(AIXSSPCanaryWordName);17726return TargetLowering::getSDagStackGuard(M);17727}1772817729bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,17730bool ForCodeSize) const {17731if (!VT.isSimple() || !Subtarget.hasVSX())17732return false;1773317734switch(VT.getSimpleVT().SimpleTy) {17735default:17736// For FP types that are currently not supported by PPC backend, return17737// false. 
    // Examples: f16, f80.
    return false;
  case MVT::f32:
  case MVT::f64: {
    if (Subtarget.hasPrefixInstrs() && Subtarget.hasP10Vector()) {
      // We can materialize all immediates via XXSPLTI32DX and XXSPLTIDP.
      return true;
    }
    bool IsExact;
    APSInt IntResult(16, false);
    // The rounding mode doesn't really matter because we only care about
    // floats that can be converted to integers exactly.
    Imm.convertToInteger(IntResult, APFloat::rmTowardZero, &IsExact);
    // For exact values in the range [-16, 15] we can materialize the float.
    if (IsExact && IntResult <= 15 && IntResult >= -16)
      return true;
    return Imm.isZero();
  }
  case MVT::ppcf128:
    return Imm.isPosZero();
  }
}

// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();
  unsigned Opcode = N->getOpcode();
  unsigned TargetOpcode;

  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected shift operation");
  case ISD::SHL:
    TargetOpcode = PPCISD::SHL;
    break;
  case ISD::SRL:
    TargetOpcode = PPCISD::SRL;
    break;
  case ISD::SRA:
    TargetOpcode = PPCISD::SRA;
    break;
  }

  if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
      N1->getOpcode() == ISD::AND)
    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
      if (Mask->getZExtValue() == OpSizeInBits - 1)
        return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));

  return SDValue();
}

SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  SDValue N0 = N->getOperand(0);
  ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() ||
      N0.getOpcode() != ISD::SIGN_EXTEND ||
      N0.getOperand(0).getValueType() != MVT::i32 || CN1 == nullptr ||
      N->getValueType(0) != MVT::i64)
    return SDValue();

  // We can't save an operation here if the value is already extended, and
  // the existing shift is easier to combine.
  SDValue ExtsSrc = N0.getOperand(0);
  if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
      ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
    return SDValue();

  SDLoc DL(N0);
  SDValue ShiftBy = SDValue(CN1, 0);
  // We want the shift amount to be i32 on the extswli, but the shift could
  // have an i64.
  if (ShiftBy.getValueType() == MVT::i64)
    ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32);

  return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0),
                         ShiftBy);
}

SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  return SDValue();
}

SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  return SDValue();
}

// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is
zero, the equation (addi Z, -C) can be simplified to Z17841// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types17842static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,17843const PPCSubtarget &Subtarget) {17844if (!Subtarget.isPPC64())17845return SDValue();1784617847SDValue LHS = N->getOperand(0);17848SDValue RHS = N->getOperand(1);1784917850auto isZextOfCompareWithConstant = [](SDValue Op) {17851if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||17852Op.getValueType() != MVT::i64)17853return false;1785417855SDValue Cmp = Op.getOperand(0);17856if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||17857Cmp.getOperand(0).getValueType() != MVT::i64)17858return false;1785917860if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {17861int64_t NegConstant = 0 - Constant->getSExtValue();17862// Due to the limitations of the addi instruction,17863// -C is required to be [-32768, 32767].17864return isInt<16>(NegConstant);17865}1786617867return false;17868};1786917870bool LHSHasPattern = isZextOfCompareWithConstant(LHS);17871bool RHSHasPattern = isZextOfCompareWithConstant(RHS);1787217873// If there is a pattern, canonicalize a zext operand to the RHS.17874if (LHSHasPattern && !RHSHasPattern)17875std::swap(LHS, RHS);17876else if (!LHSHasPattern && !RHSHasPattern)17877return SDValue();1787817879SDLoc DL(N);17880SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);17881SDValue Cmp = RHS.getOperand(0);17882SDValue Z = Cmp.getOperand(0);17883auto *Constant = cast<ConstantSDNode>(Cmp.getOperand(1));17884int64_t NegConstant = 0 - Constant->getSExtValue();1788517886switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {17887default: break;17888case ISD::SETNE: {17889// when C == 017890// --> addze X, (addic Z, -1).carry17891// /17892// add X, (zext(setne Z, C))--17893// \ when -32768 <= -C <= 32767 && C != 017894// --> addze X, (addic (addi Z, -C), -1).carry17895SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,17896DAG.getConstant(NegConstant, DL, MVT::i64));17897SDValue AddOrZ = NegConstant != 0 ? Add : Z;17898SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),17899AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));17900return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),17901SDValue(Addc.getNode(), 1));17902}17903case ISD::SETEQ: {17904// when C == 017905// --> addze X, (subfic Z, 0).carry17906// /17907// add X, (zext(sete Z, C))--17908// \ when -32768 <= -C <= 32767 && C != 017909// --> addze X, (subfic (addi Z, -C), 0).carry17910SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,17911DAG.getConstant(NegConstant, DL, MVT::i64));17912SDValue AddOrZ = NegConstant != 0 ? 
                                                     Add : Z;
    SDValue Subc =
        DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                    DAG.getConstant(0, DL, MVT::i64), AddOrZ);
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS,
                       DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Subc.getNode(), 1));
  }
  }

  return SDValue();
}

// Transform
// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
// In this case both C1 and C2 must be known constants.
// C1+C2 must fit into a 34 bit signed integer.
static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
                                          const PPCSubtarget &Subtarget) {
  if (!Subtarget.isUsingPCRelativeCalls())
    return SDValue();

  // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
  // If we find that node try to cast the Global Address and the Constant.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
    return SDValue();

  // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
  GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
  ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(RHS);

  // Check that both casts succeeded.
  if (!GSDN || !ConstNode)
    return SDValue();

  int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
  SDLoc DL(GSDN);

  // The signed int offset needs to fit in 34 bits.
  if (!isInt<34>(NewOffset))
    return SDValue();

  // The new global address is a copy of the old global address except
  // that it has the updated Offset.
  SDValue GA =
      DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
                                 NewOffset, GSDN->getTargetFlags());
  SDValue MatPCRel =
      DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
  return MatPCRel;
}

SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
    return Value;

  if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
    return Value;

  return SDValue();
}

// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128 bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// The reason this is required is because we do not have a legal i128 type
// and so we want to prevent having to store the f128 and then reload part
// of it.
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  // If we are using CRBits then try that first.
  if (Subtarget.useCRBits()) {
    // Check if CRBits did anything and return that if it did.
    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
      return CRTruncValue;
  }

  SDLoc dl(N);
  SDValue Op0 = N->getOperand(0);

  // Looking for a truncate of i128 to i64.
  if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
    return SDValue();

  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;

  // SRL feeding TRUNCATE.
  if (Op0.getOpcode() == ISD::SRL) {
    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    // The right shift has to be by 64 bits.
    if (!ConstNode || ConstNode->getZExtValue() != 64)
      return SDValue();

    // Switch the element number to extract.
    EltToExtract = EltToExtract ? 0 : 1;
    // Update Op0 past the SRL.
    Op0 = Op0.getOperand(0);
  }

  // BITCAST feeding a TRUNCATE possibly via SRL.
  if (Op0.getOpcode() == ISD::BITCAST &&
      Op0.getValueType() == MVT::i128 &&
      Op0.getOperand(0).getValueType() == MVT::f128) {
    SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
    return DCI.DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
        DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
  }
  return SDValue();
}

SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
  if (!ConstOpOrElement)
    return SDValue();

  // An imul is usually smaller than the alternative sequence for legal type.
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      isOperationLegal(ISD::MUL, N->getValueType(0)))
    return SDValue();

  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
    switch (this->Subtarget.getCPUDirective()) {
    default:
      // TODO: enhance the condition for subtarget before pwr8
      return false;
    case PPC::DIR_PWR8:
      //  type        mul     add    shl
      // scalar        4       1      1
      // vector        7       2      2
      return true;
    case PPC::DIR_PWR9:
    case PPC::DIR_PWR10:
    case PPC::DIR_PWR11:
    case PPC::DIR_PWR_FUTURE:
      //  type        mul     add    shl
      // scalar        5       2      2
      // vector        7       2      2

      // The cycle ratios of the related operations are shown in the table
      // above. Because mul is 5 (scalar) / 7 (vector) while add/sub/shl are
      // all 2 for both scalar and vector types, the 2-instruction patterns
      // (add/sub + shl, cost 4) are always profitable; but for the
      // 3-instruction pattern
      // (mul x, -(2^N + 1)) => -(add (shl x, N), x), sub + add + shl cost 6,
      // so we should only do it for vector types.
      return IsAddOne && IsNeg ?
VT.isVector() : true;18068}18069};1807018071EVT VT = N->getValueType(0);18072SDLoc DL(N);1807318074const APInt &MulAmt = ConstOpOrElement->getAPIntValue();18075bool IsNeg = MulAmt.isNegative();18076APInt MulAmtAbs = MulAmt.abs();1807718078if ((MulAmtAbs - 1).isPowerOf2()) {18079// (mul x, 2^N + 1) => (add (shl x, N), x)18080// (mul x, -(2^N + 1)) => -(add (shl x, N), x)1808118082if (!IsProfitable(IsNeg, true, VT))18083return SDValue();1808418085SDValue Op0 = N->getOperand(0);18086SDValue Op1 =18087DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),18088DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));18089SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);1809018091if (!IsNeg)18092return Res;1809318094return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);18095} else if ((MulAmtAbs + 1).isPowerOf2()) {18096// (mul x, 2^N - 1) => (sub (shl x, N), x)18097// (mul x, -(2^N - 1)) => (sub x, (shl x, N))1809818099if (!IsProfitable(IsNeg, false, VT))18100return SDValue();1810118102SDValue Op0 = N->getOperand(0);18103SDValue Op1 =18104DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),18105DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));1810618107if (!IsNeg)18108return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);18109else18110return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);1811118112} else {18113return SDValue();18114}18115}1811618117// Combine fma-like op (like fnmsub) with fnegs to appropriate op. Do this18118// in combiner since we need to check SD flags and other subtarget features.18119SDValue PPCTargetLowering::combineFMALike(SDNode *N,18120DAGCombinerInfo &DCI) const {18121SDValue N0 = N->getOperand(0);18122SDValue N1 = N->getOperand(1);18123SDValue N2 = N->getOperand(2);18124SDNodeFlags Flags = N->getFlags();18125EVT VT = N->getValueType(0);18126SelectionDAG &DAG = DCI.DAG;18127const TargetOptions &Options = getTargetMachine().Options;18128unsigned Opc = N->getOpcode();18129bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();18130bool LegalOps = !DCI.isBeforeLegalizeOps();18131SDLoc Loc(N);1813218133if (!isOperationLegal(ISD::FMA, VT))18134return SDValue();1813518136// Allowing transformation to FNMSUB may change sign of zeroes when ab-c=018137// since (fnmsub a b c)=-0 while c-ab=+0.18138if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)18139return SDValue();1814018141// (fma (fneg a) b c) => (fnmsub a b c)18142// (fnmsub (fneg a) b c) => (fma a b c)18143if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))18144return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);1814518146// (fma a (fneg b) c) => (fnmsub a b c)18147// (fnmsub a (fneg b) c) => (fma a b c)18148if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))18149return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);1815018151return SDValue();18152}1815318154bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {18155// Only duplicate to increase tail-calls for the 64bit SysV ABIs.18156if (!Subtarget.is64BitELFABI())18157return false;1815818159// If not a tail call then no need to proceed.18160if (!CI->isTailCall())18161return false;1816218163// If sibling calls have been disabled and tail-calls aren't guaranteed18164// there is no reason to duplicate.18165auto &TM = getTargetMachine();18166if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)18167return false;1816818169// Can't tail call a function called indirectly, or if it has variadic args.18170const Function *Callee = 
CI->getCalledFunction();18171if (!Callee || Callee->isVarArg())18172return false;1817318174// Make sure the callee and caller calling conventions are eligible for tco.18175const Function *Caller = CI->getParent()->getParent();18176if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),18177CI->getCallingConv()))18178return false;1817918180// If the function is local then we have a good chance at tail-calling it18181return getTargetMachine().shouldAssumeDSOLocal(Callee);18182}1818318184bool PPCTargetLowering::18185isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {18186const Value *Mask = AndI.getOperand(1);18187// If the mask is suitable for andi. or andis. we should sink the and.18188if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {18189// Can't handle constants wider than 64-bits.18190if (CI->getBitWidth() > 64)18191return false;18192int64_t ConstVal = CI->getZExtValue();18193return isUInt<16>(ConstVal) ||18194(isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));18195}1819618197// For non-constant masks, we can always use the record-form and.18198return true;18199}1820018201/// getAddrModeForFlags - Based on the set of address flags, select the most18202/// optimal instruction format to match by.18203PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {18204// This is not a node we should be handling here.18205if (Flags == PPC::MOF_None)18206return PPC::AM_None;18207// Unaligned D-Forms are tried first, followed by the aligned D-Forms.18208for (auto FlagSet : AddrModesMap.at(PPC::AM_DForm))18209if ((Flags & FlagSet) == FlagSet)18210return PPC::AM_DForm;18211for (auto FlagSet : AddrModesMap.at(PPC::AM_DSForm))18212if ((Flags & FlagSet) == FlagSet)18213return PPC::AM_DSForm;18214for (auto FlagSet : AddrModesMap.at(PPC::AM_DQForm))18215if ((Flags & FlagSet) == FlagSet)18216return PPC::AM_DQForm;18217for (auto FlagSet : AddrModesMap.at(PPC::AM_PrefixDForm))18218if ((Flags & FlagSet) == FlagSet)18219return PPC::AM_PrefixDForm;18220// If no other forms are selected, return an X-Form as it is the most18221// general addressing mode.18222return PPC::AM_XForm;18223}1822418225/// Set alignment flags based on whether or not the Frame Index is aligned.18226/// Utilized when computing flags for address computation when selecting18227/// load and store instructions.18228static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,18229SelectionDAG &DAG) {18230bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));18231FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(IsAdd ? N.getOperand(0) : N);18232if (!FI)18233return;18234const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();18235unsigned FrameIndexAlign = MFI.getObjectAlign(FI->getIndex()).value();18236// If this is (add $FI, $S16Imm), the alignment flags are already set18237// based on the immediate. We just need to clear the alignment flags18238// if the FI alignment is weaker.18239if ((FrameIndexAlign % 4) != 0)18240FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;18241if ((FrameIndexAlign % 16) != 0)18242FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;18243// If the address is a plain FrameIndex, set alignment flags based on18244// FI alignment.18245if (!IsAdd) {18246if ((FrameIndexAlign % 4) == 0)18247FlagSet |= PPC::MOF_RPlusSImm16Mult4;18248if ((FrameIndexAlign % 16) == 0)18249FlagSet |= PPC::MOF_RPlusSImm16Mult16;18250}18251}1825218253/// Given a node, compute flags that are used for address computation when18254/// selecting load and store instructions. 
The flags computed are stored in18255/// FlagSet. This function takes into account whether the node is a constant,18256/// an ADD, OR, or a constant, and computes the address flags accordingly.18257static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,18258SelectionDAG &DAG) {18259// Set the alignment flags for the node depending on if the node is18260// 4-byte or 16-byte aligned.18261auto SetAlignFlagsForImm = [&](uint64_t Imm) {18262if ((Imm & 0x3) == 0)18263FlagSet |= PPC::MOF_RPlusSImm16Mult4;18264if ((Imm & 0xf) == 0)18265FlagSet |= PPC::MOF_RPlusSImm16Mult16;18266};1826718268if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {18269// All 32-bit constants can be computed as LIS + Disp.18270const APInt &ConstImm = CN->getAPIntValue();18271if (ConstImm.isSignedIntN(32)) { // Flag to handle 32-bit constants.18272FlagSet |= PPC::MOF_AddrIsSImm32;18273SetAlignFlagsForImm(ConstImm.getZExtValue());18274setAlignFlagsForFI(N, FlagSet, DAG);18275}18276if (ConstImm.isSignedIntN(34)) // Flag to handle 34-bit constants.18277FlagSet |= PPC::MOF_RPlusSImm34;18278else // Let constant materialization handle large constants.18279FlagSet |= PPC::MOF_NotAddNorCst;18280} else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {18281// This address can be represented as an addition of:18282// - Register + Imm16 (possibly a multiple of 4/16)18283// - Register + Imm3418284// - Register + PPCISD::Lo18285// - Register + Register18286// In any case, we won't have to match this as Base + Zero.18287SDValue RHS = N.getOperand(1);18288if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(RHS)) {18289const APInt &ConstImm = CN->getAPIntValue();18290if (ConstImm.isSignedIntN(16)) {18291FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.18292SetAlignFlagsForImm(ConstImm.getZExtValue());18293setAlignFlagsForFI(N, FlagSet, DAG);18294}18295if (ConstImm.isSignedIntN(34))18296FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.18297else18298FlagSet |= PPC::MOF_RPlusR; // Register.18299} else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(1))18300FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.18301else18302FlagSet |= PPC::MOF_RPlusR;18303} else { // The address computation is not a constant or an addition.18304setAlignFlagsForFI(N, FlagSet, DAG);18305FlagSet |= PPC::MOF_NotAddNorCst;18306}18307}1830818309static bool isPCRelNode(SDValue N) {18310return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||18311isValidPCRelNode<ConstantPoolSDNode>(N) ||18312isValidPCRelNode<GlobalAddressSDNode>(N) ||18313isValidPCRelNode<JumpTableSDNode>(N) ||18314isValidPCRelNode<BlockAddressSDNode>(N));18315}1831618317/// computeMOFlags - Given a node N and it's Parent (a MemSDNode), compute18318/// the address flags of the load/store instruction that is to be matched.18319unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,18320SelectionDAG &DAG) const {18321unsigned FlagSet = PPC::MOF_None;1832218323// Compute subtarget flags.18324if (!Subtarget.hasP9Vector())18325FlagSet |= PPC::MOF_SubtargetBeforeP9;18326else18327FlagSet |= PPC::MOF_SubtargetP9;1832818329if (Subtarget.hasPrefixInstrs())18330FlagSet |= PPC::MOF_SubtargetP10;1833118332if (Subtarget.hasSPE())18333FlagSet |= PPC::MOF_SubtargetSPE;1833418335// Check if we have a PCRel node and return early.18336if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))18337return FlagSet;1833818339// If the node is the paired load/store intrinsics, compute flags for18340// address computation and return 
static bool isPCRelNode(SDValue N) {
  return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||
          isValidPCRelNode<ConstantPoolSDNode>(N) ||
          isValidPCRelNode<GlobalAddressSDNode>(N) ||
          isValidPCRelNode<JumpTableSDNode>(N) ||
          isValidPCRelNode<BlockAddressSDNode>(N));
}

/// computeMOFlags - Given a node N and its Parent (a MemSDNode), compute
/// the address flags of the load/store instruction that is to be matched.
unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
                                           SelectionDAG &DAG) const {
  unsigned FlagSet = PPC::MOF_None;

  // Compute subtarget flags.
  if (!Subtarget.hasP9Vector())
    FlagSet |= PPC::MOF_SubtargetBeforeP9;
  else
    FlagSet |= PPC::MOF_SubtargetP9;

  if (Subtarget.hasPrefixInstrs())
    FlagSet |= PPC::MOF_SubtargetP10;

  if (Subtarget.hasSPE())
    FlagSet |= PPC::MOF_SubtargetSPE;

  // Check if we have a PCRel node and return early.
  if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
    return FlagSet;

  // If the node is the paired load/store intrinsics, compute flags for
  // address computation and return early.
  unsigned ParentOp = Parent->getOpcode();
  if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
                               (ParentOp == ISD::INTRINSIC_VOID))) {
    unsigned ID = Parent->getConstantOperandVal(1);
    if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
      SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
                             ? Parent->getOperand(2)
                             : Parent->getOperand(3);
      computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG);
      FlagSet |= PPC::MOF_Vector;
      return FlagSet;
    }
  }

  // Mark this as something we don't want to handle here if it is an atomic
  // or a pre-increment instruction.
  if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Parent))
    if (LSB->isIndexed())
      return PPC::MOF_None;

  // Compute in-memory type flags. This is based on if there are scalars,
  // floats or vectors.
  const MemSDNode *MN = dyn_cast<MemSDNode>(Parent);
  assert(MN && "Parent should be a MemSDNode!");
  EVT MemVT = MN->getMemoryVT();
  unsigned Size = MemVT.getSizeInBits();
  if (MemVT.isScalarInteger()) {
    assert(Size <= 128 &&
           "Not expecting scalar integers larger than 16 bytes!");
    if (Size < 32)
      FlagSet |= PPC::MOF_SubWordInt;
    else if (Size == 32)
      FlagSet |= PPC::MOF_WordInt;
    else
      FlagSet |= PPC::MOF_DoubleWordInt;
  } else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
    if (Size == 128)
      FlagSet |= PPC::MOF_Vector;
    else if (Size == 256) {
      assert(Subtarget.pairedVectorMemops() &&
             "256-bit vectors are only available when paired vector memops is "
             "enabled!");
      FlagSet |= PPC::MOF_Vector;
    } else
      llvm_unreachable("Not expecting illegal vectors!");
  } else { // Floating point type: can be scalar, f128 or vector types.
    if (Size == 32 || Size == 64)
      FlagSet |= PPC::MOF_ScalarFloat;
    else if (MemVT == MVT::f128 || MemVT.isVector())
      FlagSet |= PPC::MOF_Vector;
    else
      llvm_unreachable("Not expecting illegal scalar floats!");
  }

  // Compute flags for address computation.
  computeFlagsForAddressComputation(N, FlagSet, DAG);

  // Compute type extension flags.
  if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Parent)) {
    switch (LN->getExtensionType()) {
    case ISD::SEXTLOAD:
      FlagSet |= PPC::MOF_SExt;
      break;
    case ISD::EXTLOAD:
    case ISD::ZEXTLOAD:
      FlagSet |= PPC::MOF_ZExt;
      break;
    case ISD::NON_EXTLOAD:
      FlagSet |= PPC::MOF_NoExt;
      break;
    }
  } else
    FlagSet |= PPC::MOF_NoExt;

  // For integers, no extension is the same as zero extension.
  // We set the extension mode to zero extension so we don't have
  // to add separate entries in AddrModesMap for loads and stores.
  if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
    FlagSet |= PPC::MOF_ZExt;
    FlagSet &= ~PPC::MOF_NoExt;
  }

  // If we don't have prefixed instructions, 34-bit constants should be
  // treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
  bool IsNonP1034BitConst =
      ((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
       FlagSet) == PPC::MOF_RPlusSImm34;
  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
      IsNonP1034BitConst)
    FlagSet |= PPC::MOF_NotAddNorCst;

  return FlagSet;
}
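// The resulting FlagSet is the union of subtarget, in-memory type, extension
// and address-computation flags. For instance, a sign-extending i16 load from
// (add %reg, 8) on a Power9 subtarget would carry MOF_SubtargetP9 |
// MOF_SubWordInt | MOF_SExt along with the immediate flags computed above.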
/// SelectForceXFormMode - Given the specified address, force it to be
/// represented as an indexed [r+r] operation (an XForm instruction).
PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
                                                      SDValue &Base,
                                                      SelectionDAG &DAG) const {

  PPC::AddrMode Mode = PPC::AM_XForm;
  int16_t ForceXFormImm = 0;
  if (provablyDisjointOr(DAG, N) &&
      !isIntS16Immediate(N.getOperand(1), ForceXFormImm)) {
    Disp = N.getOperand(0);
    Base = N.getOperand(1);
    return Mode;
  }

  // If the address is the result of an add, we will utilize the fact that the
  // address calculation includes an implicit add. However, we can reduce
  // register pressure if we do not materialize a constant just for use as the
  // index register. We only get rid of the add if it is not an add of a
  // value and a 16-bit signed constant and both have a single use.
  if (N.getOpcode() == ISD::ADD &&
      (!isIntS16Immediate(N.getOperand(1), ForceXFormImm) ||
       !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
    Disp = N.getOperand(0);
    Base = N.getOperand(1);
    return Mode;
  }

  // Otherwise, use R0 as the base register.
  Disp = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                         N.getValueType());
  Base = N;

  return Mode;
}

bool PPCTargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  EVT ValVT = Val.getValueType();
  // If we are splitting a scalar integer into f64 parts (i.e. so they
  // can be placed into VFRC registers), we need to zero extend and
  // bitcast the values. This will ensure the value is placed into a
  // VSR using direct moves or stack operations as needed.
  if (PartVT == MVT::f64 &&
      (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) {
    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Val);
    Parts[0] = Val;
    return true;
  }
  return false;
}
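// In other words, an i32 value destined for an f64 register part is widened
// to i64 and then reinterpreted as f64: the i32 value 1 becomes the f64 bit
// pattern 0x0000000000000001, not the floating-point value 1.0.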
SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
                                          SelectionDAG &DAG) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::CallLoweringInfo CLI(DAG);
  EVT RetVT = Op.getValueType();
  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
  SDValue Callee =
      DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout()));
  bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, false);
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (const SDValue &N : Op->op_values()) {
    EVT ArgVT = N.getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Node = N;
    Entry.Ty = ArgTy;
    Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, SignExtend);
    Entry.IsZExt = !Entry.IsSExt;
    Args.push_back(Entry);
  }

  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;
  const Function &F = DAG.getMachineFunction().getFunction();
  bool isTailCall =
      TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
      (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
  if (isTailCall)
    InChain = TCChain;
  CLI.setDebugLoc(SDLoc(Op))
      .setChain(InChain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
      .setTailCall(isTailCall)
      .setSExtResult(SignExtend)
      .setZExtResult(!SignExtend)
      .setIsPostTypeLegalization(true);
  return TLI.LowerCallTo(CLI).first;
}

SDValue PPCTargetLowering::lowerLibCallBasedOnType(
    const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op,
    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::f32)
    return lowerToLibCall(LibCallFloatName, Op, DAG);

  if (Op.getValueType() == MVT::f64)
    return lowerToLibCall(LibCallDoubleName, Op, DAG);

  return SDValue();
}

bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const {
  SDNodeFlags Flags = Op.getNode()->getFlags();
  return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() &&
         Flags.hasNoNaNs() && Flags.hasNoInfs();
}

bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
  return Op.getNode()->getFlags().hasApproximateFuncs();
}

bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
  return getTargetMachine().Options.PPCGenScalarMASSEntries;
}

SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
                                            const char *LibCallFloatName,
                                            const char *LibCallDoubleNameFinite,
                                            const char *LibCallFloatNameFinite,
                                            SDValue Op,
                                            SelectionDAG &DAG) const {
  if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
    return SDValue();

  if (!isLowringToMASSFiniteSafe(Op))
    return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op,
                                   DAG);

  return lowerLibCallBasedOnType(LibCallFloatNameFinite,
                                 LibCallDoubleNameFinite, Op, DAG);
}

SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_pow", "__xl_powf", "__xl_pow_finite",
                          "__xl_powf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_sin", "__xl_sinf", "__xl_sin_finite",
                          "__xl_sinf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_cos", "__xl_cosf", "__xl_cos_finite",
                          "__xl_cosf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_log", "__xl_logf", "__xl_log_finite",
                          "__xl_logf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_log10", "__xl_log10f", "__xl_log10_finite",
                          "__xl_log10f_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_exp", "__xl_expf", "__xl_exp_finite",
                          "__xl_expf_finite", Op, DAG);
}
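// As an example of the MASS lowering above: when PPCGenScalarMASSEntries is
// enabled and the call carries the 'afn' fast-math flag, pow is redirected to
// __xl_pow (f64) or __xl_powf (f32); if 'nnan', 'ninf' and 'nsz' are also
// present, the finite entry points __xl_pow_finite / __xl_powf_finite are
// used instead.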
// If we happen to match to an aligned D-Form, check if the Frame Index is
// adequately aligned. If it is not, reset the mode to match to X-Form.
static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
                                   PPC::AddrMode &Mode) {
  if (!isa<FrameIndexSDNode>(N))
    return;
  if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
      (Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
    Mode = PPC::AM_XForm;
}
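// Note: the DS-Form displacement must be a multiple of 4 (e.g. ld/std) and
// the DQ-Form displacement a multiple of 16 (e.g. lxv/stxv), so a frame index
// that cannot guarantee that alignment has to be matched with the indexed
// X-Form instead.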
/// SelectOptimalAddrMode - Based on a node N and its Parent (a MemSDNode),
/// compute the address flags of the node, get the optimal address mode based
/// on the flags, and set the Base and Disp based on the address mode.
PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
                                                       SDValue N, SDValue &Disp,
                                                       SDValue &Base,
                                                       SelectionDAG &DAG,
                                                       MaybeAlign Align) const {
  SDLoc DL(Parent);

  // Compute the address flags.
  unsigned Flags = computeMOFlags(Parent, N, DAG);

  // Get the optimal address mode based on the Flags.
  PPC::AddrMode Mode = getAddrModeForFlags(Flags);

  // If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
  // Select an X-Form load if it is not.
  setXFormForUnalignedFI(N, Flags, Mode);

  // Set the mode to PC-Relative addressing mode if we have a valid PC-Rel node.
  if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
    assert(Subtarget.isUsingPCRelativeCalls() &&
           "Must be using PC-Relative calls when a valid PC-Relative node is "
           "present!");
    Mode = PPC::AM_PCRel;
  }

  // Set Base and Disp accordingly depending on the address mode.
  switch (Mode) {
  case PPC::AM_DForm:
  case PPC::AM_DSForm:
  case PPC::AM_DQForm: {
    // This is a register plus a 16-bit immediate. The base will be the
    // register and the displacement will be the immediate unless it
    // isn't sufficiently aligned.
    if (Flags & PPC::MOF_RPlusSImm16) {
      SDValue Op0 = N.getOperand(0);
      SDValue Op1 = N.getOperand(1);
      int16_t Imm = Op1->getAsZExtVal();
      if (!Align || isAligned(*Align, Imm)) {
        Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());
        Base = Op0;
        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op0)) {
          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
        }
        break;
      }
    }
    // This is a register plus the @lo relocation. The base is the register
    // and the displacement is the global address.
    else if (Flags & PPC::MOF_RPlusLo) {
      Disp = N.getOperand(1).getOperand(0); // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      break;
    }
    // This is a constant address at most 32 bits. The base will be
    // zero or load-immediate-shifted and the displacement will be
    // the low 16 bits of the address.
    else if (Flags & PPC::MOF_AddrIsSImm32) {
      auto *CN = cast<ConstantSDNode>(N);
      EVT CNType = CN->getValueType(0);
      uint64_t CNImm = CN->getZExtValue();
      // If this address fits entirely in a 16-bit sext immediate field, codegen
      // this as "d, 0".
      int16_t Imm;
      if (isIntS16Immediate(CN, Imm) && (!Align || isAligned(*Align, Imm))) {
        Disp = DAG.getTargetConstant(Imm, DL, CNType);
        Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                               CNType);
        break;
      }
      // Handle 32-bit sext immediate with LIS + Addr mode.
      if ((CNType == MVT::i32 || isInt<32>(CNImm)) &&
          (!Align || isAligned(*Align, CNImm))) {
        int32_t Addr = (int32_t)CNImm;
        // Otherwise, break this down into LIS + Disp.
        Disp = DAG.getTargetConstant((int16_t)Addr, DL, MVT::i32);
        Base =
            DAG.getTargetConstant((Addr - (int16_t)Addr) >> 16, DL, MVT::i32);
        uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
        Base = SDValue(DAG.getMachineNode(LIS, DL, CNType, Base), 0);
        break;
      }
    }
    // Otherwise, the PPC::MOF_NotAddNorCst flag is set. Load/Store is
    // non-foldable.
    Disp = DAG.getTargetConstant(0, DL, getPointerTy(DAG.getDataLayout()));
    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
      Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
    } else
      Base = N;
    break;
  }
  case PPC::AM_PrefixDForm: {
    int64_t Imm34 = 0;
    unsigned Opcode = N.getOpcode();
    if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
        (isIntS34Immediate(N.getOperand(1), Imm34))) {
      // N is an Add/OR Node, and its operand is a 34-bit signed immediate.
      Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      else
        Base = N.getOperand(0);
    } else if (isIntS34Immediate(N, Imm34)) {
      // The address is a 34-bit signed immediate.
      Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
      Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
    }
    break;
  }
  case PPC::AM_PCRel: {
    // When selecting PC-Relative instructions, "Base" is not utilized as
    // we select the address as [PC+imm].
    Disp = N;
    break;
  }
  case PPC::AM_None:
    break;
  default: { // By default, X-Form is always available to be selected.
    // When a frame index is not aligned, we also match by XForm.
    FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N);
    Base = FI ? N : N.getOperand(1);
    Disp = FI ? DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                                N.getValueType())
              : N.getOperand(0);
    break;
  }
  }
  return Mode;
}
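// For instance, a doubleword load from (add %r3, 40) is typically matched as
// AM_DSForm with Base = %r3 and Disp = 40, whereas an address that only sets
// MOF_RPlusR falls through to the default X-Form case with the two registers
// used as Base and Disp.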
CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
                                                 bool Return,
                                                 bool IsVarArg) const {
  switch (CC) {
  case CallingConv::Cold:
    return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF);
  default:
    return CC_PPC64_ELF;
  }
}

bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
  return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics();
}

TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  if (shouldInlineQuadwordAtomics() && Size == 128)
    return AtomicExpansionKind::MaskedIntrinsic;

  switch (AI->getOperation()) {
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    return AtomicExpansionKind::CmpXChg;
  default:
    return TargetLowering::shouldExpandAtomicRMWInIR(AI);
  }

  llvm_unreachable("unreachable atomicrmw operation");
}

TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
  if (shouldInlineQuadwordAtomics() && Size == 128)
    return AtomicExpansionKind::MaskedIntrinsic;
  return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
}

static Intrinsic::ID
getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
  switch (BinOp) {
  default:
    llvm_unreachable("Unexpected AtomicRMW BinOp");
  case AtomicRMWInst::Xchg:
    return Intrinsic::ppc_atomicrmw_xchg_i128;
  case AtomicRMWInst::Add:
    return Intrinsic::ppc_atomicrmw_add_i128;
  case AtomicRMWInst::Sub:
    return Intrinsic::ppc_atomicrmw_sub_i128;
  case AtomicRMWInst::And:
    return Intrinsic::ppc_atomicrmw_and_i128;
  case AtomicRMWInst::Or:
    return Intrinsic::ppc_atomicrmw_or_i128;
  case AtomicRMWInst::Xor:
    return Intrinsic::ppc_atomicrmw_xor_i128;
  case AtomicRMWInst::Nand:
    return Intrinsic::ppc_atomicrmw_nand_i128;
  }
}

Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
    IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
    Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = Incr->getType();
  assert(ValTy->getPrimitiveSizeInBits() == 128);
  Function *RMW = Intrinsic::getDeclaration(
      M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation()));
  Type *Int64Ty = Type::getInt64Ty(M->getContext());
  Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo");
  Value *IncrHi =
      Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi");
  Value *LoHi = Builder.CreateCall(RMW, {AlignedAddr, IncrLo, IncrHi});
  Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
  Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
  return Builder.CreateOr(
      Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}
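// Roughly, for an i128 'atomicrmw add' the code above emits (value names are
// illustrative):
//   %incr_lo = trunc i128 %incr to i64
//   %incr_hi = trunc i128 (lshr i128 %incr, 64) to i64
//   %lohi = call { i64, i64 } @llvm.ppc.atomicrmw.add.i128(ptr, i64, i64)
// and the i128 result is rebuilt as zext(%lo) | (zext(%hi) << 64).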
Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
    IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = CmpVal->getType();
  assert(ValTy->getPrimitiveSizeInBits() == 128);
  Function *IntCmpXchg =
      Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128);
  Type *Int64Ty = Type::getInt64Ty(M->getContext());
  Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo");
  Value *CmpHi =
      Builder.CreateTrunc(Builder.CreateLShr(CmpVal, 64), Int64Ty, "cmp_hi");
  Value *NewLo = Builder.CreateTrunc(NewVal, Int64Ty, "new_lo");
  Value *NewHi =
      Builder.CreateTrunc(Builder.CreateLShr(NewVal, 64), Int64Ty, "new_hi");
  emitLeadingFence(Builder, CI, Ord);
  Value *LoHi =
      Builder.CreateCall(IntCmpXchg, {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
  emitTrailingFence(Builder, CI, Ord);
  Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
  Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
  return Builder.CreateOr(
      Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}