Path: blob/main/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
103835 views
//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//7//8// This file defines the interfaces that ARM uses to lower LLVM code into a9// selection DAG.10//11//===----------------------------------------------------------------------===//1213#include "ARMISelLowering.h"14#include "ARMBaseInstrInfo.h"15#include "ARMBaseRegisterInfo.h"16#include "ARMCallingConv.h"17#include "ARMConstantPoolValue.h"18#include "ARMMachineFunctionInfo.h"19#include "ARMPerfectShuffle.h"20#include "ARMRegisterInfo.h"21#include "ARMSelectionDAGInfo.h"22#include "ARMSubtarget.h"23#include "ARMTargetTransformInfo.h"24#include "MCTargetDesc/ARMAddressingModes.h"25#include "MCTargetDesc/ARMBaseInfo.h"26#include "Utils/ARMBaseInfo.h"27#include "llvm/ADT/APFloat.h"28#include "llvm/ADT/APInt.h"29#include "llvm/ADT/ArrayRef.h"30#include "llvm/ADT/BitVector.h"31#include "llvm/ADT/DenseMap.h"32#include "llvm/ADT/STLExtras.h"33#include "llvm/ADT/SmallPtrSet.h"34#include "llvm/ADT/SmallVector.h"35#include "llvm/ADT/Statistic.h"36#include "llvm/ADT/StringExtras.h"37#include "llvm/ADT/StringRef.h"38#include "llvm/ADT/StringSwitch.h"39#include "llvm/ADT/Twine.h"40#include "llvm/Analysis/VectorUtils.h"41#include "llvm/CodeGen/CallingConvLower.h"42#include "llvm/CodeGen/ComplexDeinterleavingPass.h"43#include "llvm/CodeGen/ISDOpcodes.h"44#include "llvm/CodeGen/IntrinsicLowering.h"45#include "llvm/CodeGen/MachineBasicBlock.h"46#include "llvm/CodeGen/MachineConstantPool.h"47#include "llvm/CodeGen/MachineFrameInfo.h"48#include "llvm/CodeGen/MachineFunction.h"49#include "llvm/CodeGen/MachineInstr.h"50#include "llvm/CodeGen/MachineInstrBuilder.h"51#include "llvm/CodeGen/MachineJumpTableInfo.h"52#include "llvm/CodeGen/MachineMemOperand.h"53#include "llvm/CodeGen/MachineOperand.h"54#include "llvm/CodeGen/MachineRegisterInfo.h"55#include "llvm/CodeGen/RuntimeLibcallUtil.h"56#include "llvm/CodeGen/SelectionDAG.h"57#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"58#include "llvm/CodeGen/SelectionDAGNodes.h"59#include "llvm/CodeGen/TargetInstrInfo.h"60#include "llvm/CodeGen/TargetLowering.h"61#include "llvm/CodeGen/TargetOpcodes.h"62#include "llvm/CodeGen/TargetRegisterInfo.h"63#include "llvm/CodeGen/TargetSubtargetInfo.h"64#include "llvm/CodeGen/ValueTypes.h"65#include "llvm/CodeGenTypes/MachineValueType.h"66#include "llvm/IR/Attributes.h"67#include "llvm/IR/CallingConv.h"68#include "llvm/IR/Constant.h"69#include "llvm/IR/Constants.h"70#include "llvm/IR/DataLayout.h"71#include "llvm/IR/DebugLoc.h"72#include "llvm/IR/DerivedTypes.h"73#include "llvm/IR/Function.h"74#include "llvm/IR/GlobalAlias.h"75#include "llvm/IR/GlobalValue.h"76#include "llvm/IR/GlobalVariable.h"77#include "llvm/IR/IRBuilder.h"78#include "llvm/IR/InlineAsm.h"79#include "llvm/IR/Instruction.h"80#include "llvm/IR/Instructions.h"81#include "llvm/IR/IntrinsicInst.h"82#include "llvm/IR/Intrinsics.h"83#include "llvm/IR/IntrinsicsARM.h"84#include "llvm/IR/Module.h"85#include "llvm/IR/PatternMatch.h"86#include "llvm/IR/Type.h"87#include "llvm/IR/User.h"88#include "llvm/IR/Value.h"89#include "llvm/MC/MCInstrDesc.h"90#include "llvm/MC/MCInstrItineraries.h"91#include "llvm/MC/MCRegisterInfo.h"92#include "llvm/MC/MCSchedule.h"93#include 
"llvm/Support/AtomicOrdering.h"94#include "llvm/Support/BranchProbability.h"95#include "llvm/Support/Casting.h"96#include "llvm/Support/CodeGen.h"97#include "llvm/Support/CommandLine.h"98#include "llvm/Support/Compiler.h"99#include "llvm/Support/Debug.h"100#include "llvm/Support/ErrorHandling.h"101#include "llvm/Support/KnownBits.h"102#include "llvm/Support/MathExtras.h"103#include "llvm/Support/raw_ostream.h"104#include "llvm/Target/TargetMachine.h"105#include "llvm/Target/TargetOptions.h"106#include "llvm/TargetParser/Triple.h"107#include <algorithm>108#include <cassert>109#include <cstdint>110#include <cstdlib>111#include <iterator>112#include <limits>113#include <optional>114#include <tuple>115#include <utility>116#include <vector>117118using namespace llvm;119using namespace llvm::PatternMatch;120121#define DEBUG_TYPE "arm-isel"122123STATISTIC(NumTailCalls, "Number of tail calls");124STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");125STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");126STATISTIC(NumConstpoolPromoted,127"Number of constants with their storage promoted into constant pools");128129static cl::opt<bool>130ARMInterworking("arm-interworking", cl::Hidden,131cl::desc("Enable / disable ARM interworking (for debugging only)"),132cl::init(true));133134static cl::opt<bool> EnableConstpoolPromotion(135"arm-promote-constant", cl::Hidden,136cl::desc("Enable / disable promotion of unnamed_addr constants into "137"constant pools"),138cl::init(false)); // FIXME: set to true by default once PR32780 is fixed139static cl::opt<unsigned> ConstpoolPromotionMaxSize(140"arm-promote-constant-max-size", cl::Hidden,141cl::desc("Maximum size of constant to promote into a constant pool"),142cl::init(64));143static cl::opt<unsigned> ConstpoolPromotionMaxTotal(144"arm-promote-constant-max-total", cl::Hidden,145cl::desc("Maximum size of ALL constants to promote into a constant pool"),146cl::init(128));147148cl::opt<unsigned>149MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,150cl::desc("Maximum interleave factor for MVE VLDn to generate."),151cl::init(2));152153// The APCS parameter registers.154static const MCPhysReg GPRArgRegs[] = {155ARM::R0, ARM::R1, ARM::R2, ARM::R3156};157158static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,159SelectionDAG &DAG, const SDLoc &DL) {160assert(Arg.ArgVT.isScalarInteger());161assert(Arg.ArgVT.bitsLT(MVT::i32));162SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);163SDValue Ext =164DAG.getNode(Arg.Flags.isSExt() ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,165MVT::i32, Trunc);166return Ext;167}168169void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {170if (VT != PromotedLdStVT) {171setOperationAction(ISD::LOAD, VT, Promote);172AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);173174setOperationAction(ISD::STORE, VT, Promote);175AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);176}177178MVT ElemTy = VT.getVectorElementType();179if (ElemTy != MVT::f64)180setOperationAction(ISD::SETCC, VT, Custom);181setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);182setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);183if (ElemTy == MVT::i32) {184setOperationAction(ISD::SINT_TO_FP, VT, Custom);185setOperationAction(ISD::UINT_TO_FP, VT, Custom);186setOperationAction(ISD::FP_TO_SINT, VT, Custom);187setOperationAction(ISD::FP_TO_UINT, VT, Custom);188} else {189setOperationAction(ISD::SINT_TO_FP, VT, Expand);190setOperationAction(ISD::UINT_TO_FP, VT, Expand);191setOperationAction(ISD::FP_TO_SINT, VT, Expand);192setOperationAction(ISD::FP_TO_UINT, VT, Expand);193}194setOperationAction(ISD::BUILD_VECTOR, VT, Custom);195setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);196setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);197setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);198setOperationAction(ISD::SELECT, VT, Expand);199setOperationAction(ISD::SELECT_CC, VT, Expand);200setOperationAction(ISD::VSELECT, VT, Expand);201setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);202if (VT.isInteger()) {203setOperationAction(ISD::SHL, VT, Custom);204setOperationAction(ISD::SRA, VT, Custom);205setOperationAction(ISD::SRL, VT, Custom);206}207208// Neon does not support vector divide/remainder operations.209setOperationAction(ISD::SDIV, VT, Expand);210setOperationAction(ISD::UDIV, VT, Expand);211setOperationAction(ISD::FDIV, VT, Expand);212setOperationAction(ISD::SREM, VT, Expand);213setOperationAction(ISD::UREM, VT, Expand);214setOperationAction(ISD::FREM, VT, Expand);215setOperationAction(ISD::SDIVREM, VT, Expand);216setOperationAction(ISD::UDIVREM, VT, Expand);217218if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)219for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,220ISD::UMIN, ISD::UMAX})221setOperationAction(Opcode, VT, Legal);222if (!VT.isFloatingPoint())223for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})224setOperationAction(Opcode, VT, Legal);225}226227void ARMTargetLowering::addDRTypeForNEON(MVT VT) {228addRegisterClass(VT, &ARM::DPRRegClass);229addTypeForNEON(VT, MVT::f64);230}231232void ARMTargetLowering::addQRTypeForNEON(MVT VT) {233addRegisterClass(VT, &ARM::DPairRegClass);234addTypeForNEON(VT, MVT::v2f64);235}236237void ARMTargetLowering::setAllExpand(MVT VT) {238for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)239setOperationAction(Opc, VT, Expand);240241// We support these really simple operations even on types where all242// the actual arithmetic has to be broken down into simpler243// operations or turned into library calls.244setOperationAction(ISD::BITCAST, VT, Legal);245setOperationAction(ISD::LOAD, VT, Legal);246setOperationAction(ISD::STORE, VT, Legal);247setOperationAction(ISD::UNDEF, VT, Legal);248}249250void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,251LegalizeAction Action) {252setLoadExtAction(ISD::EXTLOAD, From, To, Action);253setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);254setLoadExtAction(ISD::SEXTLOAD, From, To, Action);255}256257void 
ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {258const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };259260for (auto VT : IntTypes) {261addRegisterClass(VT, &ARM::MQPRRegClass);262setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);263setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);264setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);265setOperationAction(ISD::BUILD_VECTOR, VT, Custom);266setOperationAction(ISD::SHL, VT, Custom);267setOperationAction(ISD::SRA, VT, Custom);268setOperationAction(ISD::SRL, VT, Custom);269setOperationAction(ISD::SMIN, VT, Legal);270setOperationAction(ISD::SMAX, VT, Legal);271setOperationAction(ISD::UMIN, VT, Legal);272setOperationAction(ISD::UMAX, VT, Legal);273setOperationAction(ISD::ABS, VT, Legal);274setOperationAction(ISD::SETCC, VT, Custom);275setOperationAction(ISD::MLOAD, VT, Custom);276setOperationAction(ISD::MSTORE, VT, Legal);277setOperationAction(ISD::CTLZ, VT, Legal);278setOperationAction(ISD::CTTZ, VT, Custom);279setOperationAction(ISD::BITREVERSE, VT, Legal);280setOperationAction(ISD::BSWAP, VT, Legal);281setOperationAction(ISD::SADDSAT, VT, Legal);282setOperationAction(ISD::UADDSAT, VT, Legal);283setOperationAction(ISD::SSUBSAT, VT, Legal);284setOperationAction(ISD::USUBSAT, VT, Legal);285setOperationAction(ISD::ABDS, VT, Legal);286setOperationAction(ISD::ABDU, VT, Legal);287setOperationAction(ISD::AVGFLOORS, VT, Legal);288setOperationAction(ISD::AVGFLOORU, VT, Legal);289setOperationAction(ISD::AVGCEILS, VT, Legal);290setOperationAction(ISD::AVGCEILU, VT, Legal);291292// No native support for these.293setOperationAction(ISD::UDIV, VT, Expand);294setOperationAction(ISD::SDIV, VT, Expand);295setOperationAction(ISD::UREM, VT, Expand);296setOperationAction(ISD::SREM, VT, Expand);297setOperationAction(ISD::UDIVREM, VT, Expand);298setOperationAction(ISD::SDIVREM, VT, Expand);299setOperationAction(ISD::CTPOP, VT, Expand);300setOperationAction(ISD::SELECT, VT, Expand);301setOperationAction(ISD::SELECT_CC, VT, Expand);302303// Vector reductions304setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);305setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);306setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);307setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);308setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);309setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);310setOperationAction(ISD::VECREDUCE_AND, VT, Custom);311setOperationAction(ISD::VECREDUCE_OR, VT, Custom);312setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);313314if (!HasMVEFP) {315setOperationAction(ISD::SINT_TO_FP, VT, Expand);316setOperationAction(ISD::UINT_TO_FP, VT, Expand);317setOperationAction(ISD::FP_TO_SINT, VT, Expand);318setOperationAction(ISD::FP_TO_UINT, VT, Expand);319} else {320setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);321setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);322}323324// Pre and Post inc are supported on loads and stores325for (unsigned im = (unsigned)ISD::PRE_INC;326im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {327setIndexedLoadAction(im, VT, Legal);328setIndexedStoreAction(im, VT, Legal);329setIndexedMaskedLoadAction(im, VT, Legal);330setIndexedMaskedStoreAction(im, VT, Legal);331}332}333334const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };335for (auto VT : FloatTypes) {336addRegisterClass(VT, &ARM::MQPRRegClass);337if (!HasMVEFP)338setAllExpand(VT);339340// These are legal or custom whether we have MVE.fp or not341setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);342setOperationAction(ISD::INSERT_VECTOR_ELT, VT, 
Custom);343setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);344setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);345setOperationAction(ISD::BUILD_VECTOR, VT, Custom);346setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);347setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);348setOperationAction(ISD::SETCC, VT, Custom);349setOperationAction(ISD::MLOAD, VT, Custom);350setOperationAction(ISD::MSTORE, VT, Legal);351setOperationAction(ISD::SELECT, VT, Expand);352setOperationAction(ISD::SELECT_CC, VT, Expand);353354// Pre and Post inc are supported on loads and stores355for (unsigned im = (unsigned)ISD::PRE_INC;356im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {357setIndexedLoadAction(im, VT, Legal);358setIndexedStoreAction(im, VT, Legal);359setIndexedMaskedLoadAction(im, VT, Legal);360setIndexedMaskedStoreAction(im, VT, Legal);361}362363if (HasMVEFP) {364setOperationAction(ISD::FMINNUM, VT, Legal);365setOperationAction(ISD::FMAXNUM, VT, Legal);366setOperationAction(ISD::FROUND, VT, Legal);367setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);368setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);369setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);370setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);371372// No native support for these.373setOperationAction(ISD::FDIV, VT, Expand);374setOperationAction(ISD::FREM, VT, Expand);375setOperationAction(ISD::FSQRT, VT, Expand);376setOperationAction(ISD::FSIN, VT, Expand);377setOperationAction(ISD::FCOS, VT, Expand);378setOperationAction(ISD::FTAN, VT, Expand);379setOperationAction(ISD::FPOW, VT, Expand);380setOperationAction(ISD::FLOG, VT, Expand);381setOperationAction(ISD::FLOG2, VT, Expand);382setOperationAction(ISD::FLOG10, VT, Expand);383setOperationAction(ISD::FEXP, VT, Expand);384setOperationAction(ISD::FEXP2, VT, Expand);385setOperationAction(ISD::FEXP10, VT, Expand);386setOperationAction(ISD::FNEARBYINT, VT, Expand);387}388}389390// Custom Expand smaller than legal vector reductions to prevent false zero391// items being added.392setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);393setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);394setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);395setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);396setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);397setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);398setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);399setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);400401// We 'support' these types up to bitcast/load/store level, regardless of402// MVE integer-only / float support. 
Only doing FP data processing on the FP403// vector types is inhibited at integer-only level.404const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };405for (auto VT : LongTypes) {406addRegisterClass(VT, &ARM::MQPRRegClass);407setAllExpand(VT);408setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);409setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);410setOperationAction(ISD::BUILD_VECTOR, VT, Custom);411setOperationAction(ISD::VSELECT, VT, Legal);412setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);413}414setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);415416// We can do bitwise operations on v2i64 vectors417setOperationAction(ISD::AND, MVT::v2i64, Legal);418setOperationAction(ISD::OR, MVT::v2i64, Legal);419setOperationAction(ISD::XOR, MVT::v2i64, Legal);420421// It is legal to extload from v4i8 to v4i16 or v4i32.422addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);423addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);424addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);425426// It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.427setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);428setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);429setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);430setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);431setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);432433// Some truncating stores are legal too.434setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);435setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);436setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);437438// Pre and Post inc on these are legal, given the correct extends439for (unsigned im = (unsigned)ISD::PRE_INC;440im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {441for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {442setIndexedLoadAction(im, VT, Legal);443setIndexedStoreAction(im, VT, Legal);444setIndexedMaskedLoadAction(im, VT, Legal);445setIndexedMaskedStoreAction(im, VT, Legal);446}447}448449// Predicate types450const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};451for (auto VT : pTypes) {452addRegisterClass(VT, &ARM::VCCRRegClass);453setOperationAction(ISD::BUILD_VECTOR, VT, Custom);454setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);455setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);456setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);457setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);458setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);459setOperationAction(ISD::SETCC, VT, Custom);460setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);461setOperationAction(ISD::LOAD, VT, Custom);462setOperationAction(ISD::STORE, VT, Custom);463setOperationAction(ISD::TRUNCATE, VT, Custom);464setOperationAction(ISD::VSELECT, VT, Expand);465setOperationAction(ISD::SELECT, VT, Expand);466setOperationAction(ISD::SELECT_CC, VT, Expand);467468if (!HasMVEFP) {469setOperationAction(ISD::SINT_TO_FP, VT, Expand);470setOperationAction(ISD::UINT_TO_FP, VT, Expand);471setOperationAction(ISD::FP_TO_SINT, VT, Expand);472setOperationAction(ISD::FP_TO_UINT, VT, Expand);473}474}475setOperationAction(ISD::SETCC, MVT::v2i1, Expand);476setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand);477setOperationAction(ISD::AND, MVT::v2i1, Expand);478setOperationAction(ISD::OR, MVT::v2i1, Expand);479setOperationAction(ISD::XOR, MVT::v2i1, Expand);480setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand);481setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand);482setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, 
Expand);483setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand);484485setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);486setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);487setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);488setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);489setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);490setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);491setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);492setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);493}494495ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,496const ARMSubtarget &STI)497: TargetLowering(TM), Subtarget(&STI) {498RegInfo = Subtarget->getRegisterInfo();499Itins = Subtarget->getInstrItineraryData();500501setBooleanContents(ZeroOrOneBooleanContent);502setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);503504if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&505!Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {506bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;507for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)508setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),509IsHFTarget ? CallingConv::ARM_AAPCS_VFP510: CallingConv::ARM_AAPCS);511}512513if (Subtarget->isTargetMachO()) {514// Uses VFP for Thumb libfuncs if available.515if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&516Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {517static const struct {518const RTLIB::Libcall Op;519const char * const Name;520const ISD::CondCode Cond;521} LibraryCalls[] = {522// Single-precision floating-point arithmetic.523{ RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },524{ RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },525{ RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },526{ RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },527528// Double-precision floating-point arithmetic.529{ RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },530{ RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },531{ RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },532{ RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },533534// Single-precision comparisons.535{ RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },536{ RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },537{ RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },538{ RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },539{ RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },540{ RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },541{ RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },542543// Double-precision comparisons.544{ RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },545{ RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },546{ RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },547{ RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },548{ RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },549{ RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },550{ RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },551552// Floating-point to integer conversions.553// i64 conversions are done via library routines even when generating VFP554// instructions, so use the same ones.555{ RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },556{ RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },557{ RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },558{ RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },559560// Conversions between floating types.561{ RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },562{ RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },563564// 
Integer to floating-point conversions.565// i64 conversions are done via library routines even when generating VFP566// instructions, so use the same ones.567// FIXME: There appears to be some naming inconsistency in ARM libgcc:568// e.g., __floatunsidf vs. __floatunssidfvfp.569{ RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },570{ RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },571{ RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },572{ RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },573};574575for (const auto &LC : LibraryCalls) {576setLibcallName(LC.Op, LC.Name);577if (LC.Cond != ISD::SETCC_INVALID)578setCmpLibcallCC(LC.Op, LC.Cond);579}580}581}582583// RTLIB584if (Subtarget->isAAPCS_ABI() &&585(Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||586Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {587static const struct {588const RTLIB::Libcall Op;589const char * const Name;590const CallingConv::ID CC;591const ISD::CondCode Cond;592} LibraryCalls[] = {593// Double-precision floating-point arithmetic helper functions594// RTABI chapter 4.1.2, Table 2595{ RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },596{ RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },597{ RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },598{ RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },599600// Double-precision floating-point comparison helper functions601// RTABI chapter 4.1.2, Table 3602{ RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },603{ RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },604{ RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },605{ RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },606{ RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },607{ RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },608{ RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },609610// Single-precision floating-point arithmetic helper functions611// RTABI chapter 4.1.2, Table 4612{ RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },613{ RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },614{ RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },615{ RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },616617// Single-precision floating-point comparison helper functions618// RTABI chapter 4.1.2, Table 5619{ RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },620{ RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },621{ RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },622{ RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },623{ RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },624{ RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },625{ RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },626627// Floating-point to integer conversions.628// RTABI chapter 4.1.2, Table 6629{ RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },630{ RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },631{ RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },632{ 
RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },633{ RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },634{ RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },635{ RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },636{ RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },637638// Conversions between floating types.639// RTABI chapter 4.1.2, Table 7640{ RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },641{ RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },642{ RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },643644// Integer to floating-point conversions.645// RTABI chapter 4.1.2, Table 8646{ RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },647{ RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },648{ RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },649{ RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },650{ RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },651{ RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },652{ RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },653{ RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },654655// Long long helper functions656// RTABI chapter 4.2, Table 9657{ RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },658{ RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },659{ RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },660{ RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },661662// Integer division functions663// RTABI chapter 4.3.1664{ RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },665{ RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },666{ RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },667{ RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },668{ RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },669{ RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },670{ RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },671{ RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },672};673674for (const auto &LC : LibraryCalls) {675setLibcallName(LC.Op, LC.Name);676setLibcallCallingConv(LC.Op, LC.CC);677if (LC.Cond != ISD::SETCC_INVALID)678setCmpLibcallCC(LC.Op, LC.Cond);679}680681// EABI dependent RTLIB682if (TM.Options.EABIVersion == EABI::EABI4 ||683TM.Options.EABIVersion == EABI::EABI5) {684static const struct {685const RTLIB::Libcall Op;686const char *const Name;687const CallingConv::ID CC;688const ISD::CondCode Cond;689} MemOpsLibraryCalls[] = {690// Memory operations691// RTABI chapter 4.3.4692{ RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },693{ RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },694{ RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },695};696697for (const auto &LC : 
MemOpsLibraryCalls) {698setLibcallName(LC.Op, LC.Name);699setLibcallCallingConv(LC.Op, LC.CC);700if (LC.Cond != ISD::SETCC_INVALID)701setCmpLibcallCC(LC.Op, LC.Cond);702}703}704}705706if (Subtarget->isTargetWindows()) {707static const struct {708const RTLIB::Libcall Op;709const char * const Name;710const CallingConv::ID CC;711} LibraryCalls[] = {712{ RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },713{ RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },714{ RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },715{ RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },716{ RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },717{ RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },718{ RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },719{ RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },720};721722for (const auto &LC : LibraryCalls) {723setLibcallName(LC.Op, LC.Name);724setLibcallCallingConv(LC.Op, LC.CC);725}726}727728// Use divmod compiler-rt calls for iOS 5.0 and later.729if (Subtarget->isTargetMachO() &&730!(Subtarget->isTargetIOS() &&731Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {732setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");733setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");734}735736// The half <-> float conversion functions are always soft-float on737// non-watchos platforms, but are needed for some targets which use a738// hard-float calling convention by default.739if (!Subtarget->isTargetWatchABI()) {740if (Subtarget->isAAPCS_ABI()) {741setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);742setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);743setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);744} else {745setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);746setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);747setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);748}749}750751// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have752// a __gnu_ prefix (which is the default).753if (Subtarget->isTargetAEABI()) {754static const struct {755const RTLIB::Libcall Op;756const char * const Name;757const CallingConv::ID CC;758} LibraryCalls[] = {759{ RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },760{ RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },761{ RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },762};763764for (const auto &LC : LibraryCalls) {765setLibcallName(LC.Op, LC.Name);766setLibcallCallingConv(LC.Op, LC.CC);767}768}769770if (Subtarget->isThumb1Only())771addRegisterClass(MVT::i32, &ARM::tGPRRegClass);772else773addRegisterClass(MVT::i32, &ARM::GPRRegClass);774775if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&776Subtarget->hasFPRegs()) {777addRegisterClass(MVT::f32, &ARM::SPRRegClass);778addRegisterClass(MVT::f64, &ARM::DPRRegClass);779780setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);781setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);782setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);783setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);784785if (!Subtarget->hasVFP2Base())786setAllExpand(MVT::f32);787if (!Subtarget->hasFP64())788setAllExpand(MVT::f64);789}790791if (Subtarget->hasFullFP16()) {792addRegisterClass(MVT::f16, &ARM::HPRRegClass);793setOperationAction(ISD::BITCAST, MVT::i16, 
Custom);794setOperationAction(ISD::BITCAST, MVT::f16, Custom);795796setOperationAction(ISD::FMINNUM, MVT::f16, Legal);797setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);798}799800if (Subtarget->hasBF16()) {801addRegisterClass(MVT::bf16, &ARM::HPRRegClass);802setAllExpand(MVT::bf16);803if (!Subtarget->hasFullFP16())804setOperationAction(ISD::BITCAST, MVT::bf16, Custom);805}806807for (MVT VT : MVT::fixedlen_vector_valuetypes()) {808for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {809setTruncStoreAction(VT, InnerVT, Expand);810addAllExtLoads(VT, InnerVT, Expand);811}812813setOperationAction(ISD::SMUL_LOHI, VT, Expand);814setOperationAction(ISD::UMUL_LOHI, VT, Expand);815816setOperationAction(ISD::BSWAP, VT, Expand);817}818819setOperationAction(ISD::ConstantFP, MVT::f32, Custom);820setOperationAction(ISD::ConstantFP, MVT::f64, Custom);821822setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);823setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);824825if (Subtarget->hasMVEIntegerOps())826addMVEVectorTypes(Subtarget->hasMVEFloatOps());827828// Combine low-overhead loop intrinsics so that we can lower i1 types.829if (Subtarget->hasLOB()) {830setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});831}832833if (Subtarget->hasNEON()) {834addDRTypeForNEON(MVT::v2f32);835addDRTypeForNEON(MVT::v8i8);836addDRTypeForNEON(MVT::v4i16);837addDRTypeForNEON(MVT::v2i32);838addDRTypeForNEON(MVT::v1i64);839840addQRTypeForNEON(MVT::v4f32);841addQRTypeForNEON(MVT::v2f64);842addQRTypeForNEON(MVT::v16i8);843addQRTypeForNEON(MVT::v8i16);844addQRTypeForNEON(MVT::v4i32);845addQRTypeForNEON(MVT::v2i64);846847if (Subtarget->hasFullFP16()) {848addQRTypeForNEON(MVT::v8f16);849addDRTypeForNEON(MVT::v4f16);850}851852if (Subtarget->hasBF16()) {853addQRTypeForNEON(MVT::v8bf16);854addDRTypeForNEON(MVT::v4bf16);855}856}857858if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {859// v2f64 is legal so that QR subregs can be extracted as f64 elements, but860// none of Neon, MVE or VFP supports any arithmetic operations on it.861setOperationAction(ISD::FADD, MVT::v2f64, Expand);862setOperationAction(ISD::FSUB, MVT::v2f64, Expand);863setOperationAction(ISD::FMUL, MVT::v2f64, Expand);864// FIXME: Code duplication: FDIV and FREM are expanded always, see865// ARMTargetLowering::addTypeForNEON method for details.866setOperationAction(ISD::FDIV, MVT::v2f64, Expand);867setOperationAction(ISD::FREM, MVT::v2f64, Expand);868// FIXME: Create unittest.869// In another words, find a way when "copysign" appears in DAG with vector870// operands.871setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);872// FIXME: Code duplication: SETCC has custom operation action, see873// ARMTargetLowering::addTypeForNEON method for details.874setOperationAction(ISD::SETCC, MVT::v2f64, Expand);875// FIXME: Create unittest for FNEG and for FABS.876setOperationAction(ISD::FNEG, MVT::v2f64, Expand);877setOperationAction(ISD::FABS, MVT::v2f64, Expand);878setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);879setOperationAction(ISD::FSIN, MVT::v2f64, Expand);880setOperationAction(ISD::FCOS, MVT::v2f64, Expand);881setOperationAction(ISD::FTAN, MVT::v2f64, Expand);882setOperationAction(ISD::FPOW, MVT::v2f64, Expand);883setOperationAction(ISD::FLOG, MVT::v2f64, Expand);884setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);885setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);886setOperationAction(ISD::FEXP, MVT::v2f64, Expand);887setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);888setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);889// FIXME: 
Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.890setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);891setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);892setOperationAction(ISD::FRINT, MVT::v2f64, Expand);893setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);894setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);895setOperationAction(ISD::FMA, MVT::v2f64, Expand);896}897898if (Subtarget->hasNEON()) {899// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively900// supported for v4f32.901setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);902setOperationAction(ISD::FSIN, MVT::v4f32, Expand);903setOperationAction(ISD::FCOS, MVT::v4f32, Expand);904setOperationAction(ISD::FTAN, MVT::v4f32, Expand);905setOperationAction(ISD::FPOW, MVT::v4f32, Expand);906setOperationAction(ISD::FLOG, MVT::v4f32, Expand);907setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);908setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);909setOperationAction(ISD::FEXP, MVT::v4f32, Expand);910setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);911setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);912setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);913setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);914setOperationAction(ISD::FRINT, MVT::v4f32, Expand);915setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);916setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);917918// Mark v2f32 intrinsics.919setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);920setOperationAction(ISD::FSIN, MVT::v2f32, Expand);921setOperationAction(ISD::FCOS, MVT::v2f32, Expand);922setOperationAction(ISD::FTAN, MVT::v2f32, Expand);923setOperationAction(ISD::FPOW, MVT::v2f32, Expand);924setOperationAction(ISD::FLOG, MVT::v2f32, Expand);925setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);926setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);927setOperationAction(ISD::FEXP, MVT::v2f32, Expand);928setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);929setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);930setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);931setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);932setOperationAction(ISD::FRINT, MVT::v2f32, Expand);933setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);934setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);935936// Neon does not support some operations on v1i64 and v2i64 types.937setOperationAction(ISD::MUL, MVT::v1i64, Expand);938// Custom handling for some quad-vector types to detect VMULL.939setOperationAction(ISD::MUL, MVT::v8i16, Custom);940setOperationAction(ISD::MUL, MVT::v4i32, Custom);941setOperationAction(ISD::MUL, MVT::v2i64, Custom);942// Custom handling for some vector types to avoid expensive expansions943setOperationAction(ISD::SDIV, MVT::v4i16, Custom);944setOperationAction(ISD::SDIV, MVT::v8i8, Custom);945setOperationAction(ISD::UDIV, MVT::v4i16, Custom);946setOperationAction(ISD::UDIV, MVT::v8i8, Custom);947// Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with948// a destination type that is wider than the source, and nor does949// it have a FP_TO_[SU]INT instruction with a narrower destination than950// source.951setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);952setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);953setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);954setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);955setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);956setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);957setOperationAction(ISD::FP_TO_SINT, 
MVT::v4i16, Custom);958setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);959960setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);961setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);962963// NEON does not have single instruction CTPOP for vectors with element964// types wider than 8-bits. However, custom lowering can leverage the965// v8i8/v16i8 vcnt instruction.966setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);967setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);968setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);969setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);970setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);971setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);972973setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);974setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);975976// NEON does not have single instruction CTTZ for vectors.977setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);978setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);979setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);980setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);981982setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);983setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);984setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);985setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);986987setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);988setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);989setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);990setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);991992setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);993setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);994setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);995setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);996997for (MVT VT : MVT::fixedlen_vector_valuetypes()) {998setOperationAction(ISD::MULHS, VT, Expand);999setOperationAction(ISD::MULHU, VT, Expand);1000}10011002// NEON only has FMA instructions as of VFP4.1003if (!Subtarget->hasVFP4Base()) {1004setOperationAction(ISD::FMA, MVT::v2f32, Expand);1005setOperationAction(ISD::FMA, MVT::v4f32, Expand);1006}10071008setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT,1009ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});10101011// It is legal to extload from v4i8 to v4i16 or v4i32.1012for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,1013MVT::v2i32}) {1014for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {1015setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);1016setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);1017setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);1018}1019}10201021for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,1022MVT::v4i32}) {1023setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);1024setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);1025setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);1026setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);1027}1028}10291030if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {1031setTargetDAGCombine(1032{ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR,1033ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,1034ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,1035ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN,1036ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});1037}1038if (Subtarget->hasMVEIntegerOps()) {1039setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, 
ISD::UMAX,1040ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,1041ISD::SETCC});1042}1043if (Subtarget->hasMVEFloatOps()) {1044setTargetDAGCombine(ISD::FADD);1045}10461047if (!Subtarget->hasFP64()) {1048// When targeting a floating-point unit with only single-precision1049// operations, f64 is legal for the few double-precision instructions which1050// are present However, no double-precision operations other than moves,1051// loads and stores are provided by the hardware.1052setOperationAction(ISD::FADD, MVT::f64, Expand);1053setOperationAction(ISD::FSUB, MVT::f64, Expand);1054setOperationAction(ISD::FMUL, MVT::f64, Expand);1055setOperationAction(ISD::FMA, MVT::f64, Expand);1056setOperationAction(ISD::FDIV, MVT::f64, Expand);1057setOperationAction(ISD::FREM, MVT::f64, Expand);1058setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);1059setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);1060setOperationAction(ISD::FNEG, MVT::f64, Expand);1061setOperationAction(ISD::FABS, MVT::f64, Expand);1062setOperationAction(ISD::FSQRT, MVT::f64, Expand);1063setOperationAction(ISD::FSIN, MVT::f64, Expand);1064setOperationAction(ISD::FCOS, MVT::f64, Expand);1065setOperationAction(ISD::FPOW, MVT::f64, Expand);1066setOperationAction(ISD::FLOG, MVT::f64, Expand);1067setOperationAction(ISD::FLOG2, MVT::f64, Expand);1068setOperationAction(ISD::FLOG10, MVT::f64, Expand);1069setOperationAction(ISD::FEXP, MVT::f64, Expand);1070setOperationAction(ISD::FEXP2, MVT::f64, Expand);1071setOperationAction(ISD::FEXP10, MVT::f64, Expand);1072setOperationAction(ISD::FCEIL, MVT::f64, Expand);1073setOperationAction(ISD::FTRUNC, MVT::f64, Expand);1074setOperationAction(ISD::FRINT, MVT::f64, Expand);1075setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);1076setOperationAction(ISD::FFLOOR, MVT::f64, Expand);1077setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);1078setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);1079setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);1080setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);1081setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);1082setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);1083setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);1084setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);1085setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);1086setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);1087setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);1088setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);1089}10901091if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {1092setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);1093setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);1094if (Subtarget->hasFullFP16()) {1095setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);1096setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);1097}1098}10991100if (!Subtarget->hasFP16()) {1101setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);1102setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);1103}11041105computeRegisterProperties(Subtarget->getRegisterInfo());11061107// ARM does not have floating-point extending loads.1108for (MVT VT : MVT::fp_valuetypes()) {1109setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);1110setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);1111}11121113// ... 
or truncating stores1114setTruncStoreAction(MVT::f64, MVT::f32, Expand);1115setTruncStoreAction(MVT::f32, MVT::f16, Expand);1116setTruncStoreAction(MVT::f64, MVT::f16, Expand);11171118// ARM does not have i1 sign extending load.1119for (MVT VT : MVT::integer_valuetypes())1120setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);11211122// ARM supports all 4 flavors of integer indexed load / store.1123if (!Subtarget->isThumb1Only()) {1124for (unsigned im = (unsigned)ISD::PRE_INC;1125im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {1126setIndexedLoadAction(im, MVT::i1, Legal);1127setIndexedLoadAction(im, MVT::i8, Legal);1128setIndexedLoadAction(im, MVT::i16, Legal);1129setIndexedLoadAction(im, MVT::i32, Legal);1130setIndexedStoreAction(im, MVT::i1, Legal);1131setIndexedStoreAction(im, MVT::i8, Legal);1132setIndexedStoreAction(im, MVT::i16, Legal);1133setIndexedStoreAction(im, MVT::i32, Legal);1134}1135} else {1136// Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.1137setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);1138setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);1139}11401141setOperationAction(ISD::SADDO, MVT::i32, Custom);1142setOperationAction(ISD::UADDO, MVT::i32, Custom);1143setOperationAction(ISD::SSUBO, MVT::i32, Custom);1144setOperationAction(ISD::USUBO, MVT::i32, Custom);11451146setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);1147setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);1148if (Subtarget->hasDSP()) {1149setOperationAction(ISD::SADDSAT, MVT::i8, Custom);1150setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);1151setOperationAction(ISD::SADDSAT, MVT::i16, Custom);1152setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);1153setOperationAction(ISD::UADDSAT, MVT::i8, Custom);1154setOperationAction(ISD::USUBSAT, MVT::i8, Custom);1155setOperationAction(ISD::UADDSAT, MVT::i16, Custom);1156setOperationAction(ISD::USUBSAT, MVT::i16, Custom);1157}1158if (Subtarget->hasBaseDSP()) {1159setOperationAction(ISD::SADDSAT, MVT::i32, Legal);1160setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);1161}11621163// i64 operation support.1164setOperationAction(ISD::MUL, MVT::i64, Expand);1165setOperationAction(ISD::MULHU, MVT::i32, Expand);1166if (Subtarget->isThumb1Only()) {1167setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);1168setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);1169}1170if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()1171|| (Subtarget->isThumb2() && !Subtarget->hasDSP()))1172setOperationAction(ISD::MULHS, MVT::i32, Expand);11731174setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);1175setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);1176setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);1177setOperationAction(ISD::SRL, MVT::i64, Custom);1178setOperationAction(ISD::SRA, MVT::i64, Custom);1179setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);1180setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);1181setOperationAction(ISD::LOAD, MVT::i64, Custom);1182setOperationAction(ISD::STORE, MVT::i64, Custom);11831184// MVE lowers 64 bit shifts to lsll and lsrl1185// assuming that ISD::SRL and SRA of i64 are already marked custom1186if (Subtarget->hasMVEIntegerOps())1187setOperationAction(ISD::SHL, MVT::i64, Custom);11881189// Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.1190if (Subtarget->isThumb1Only()) {1191setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);1192setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);1193setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);1194}11951196if 
(!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())1197setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);11981199// ARM does not have ROTL.1200setOperationAction(ISD::ROTL, MVT::i32, Expand);1201for (MVT VT : MVT::fixedlen_vector_valuetypes()) {1202setOperationAction(ISD::ROTL, VT, Expand);1203setOperationAction(ISD::ROTR, VT, Expand);1204}1205setOperationAction(ISD::CTTZ, MVT::i32, Custom);1206setOperationAction(ISD::CTPOP, MVT::i32, Expand);1207if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {1208setOperationAction(ISD::CTLZ, MVT::i32, Expand);1209setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);1210}12111212// @llvm.readcyclecounter requires the Performance Monitors extension.1213// Default to the 0 expansion on unsupported platforms.1214// FIXME: Technically there are older ARM CPUs that have1215// implementation-specific ways of obtaining this information.1216if (Subtarget->hasPerfMon())1217setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);12181219// Only ARMv6 has BSWAP.1220if (!Subtarget->hasV6Ops())1221setOperationAction(ISD::BSWAP, MVT::i32, Expand);12221223bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()1224: Subtarget->hasDivideInARMMode();1225if (!hasDivide) {1226// These are expanded into libcalls if the cpu doesn't have HW divider.1227setOperationAction(ISD::SDIV, MVT::i32, LibCall);1228setOperationAction(ISD::UDIV, MVT::i32, LibCall);1229}12301231if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {1232setOperationAction(ISD::SDIV, MVT::i32, Custom);1233setOperationAction(ISD::UDIV, MVT::i32, Custom);12341235setOperationAction(ISD::SDIV, MVT::i64, Custom);1236setOperationAction(ISD::UDIV, MVT::i64, Custom);1237}12381239setOperationAction(ISD::SREM, MVT::i32, Expand);1240setOperationAction(ISD::UREM, MVT::i32, Expand);12411242// Register based DivRem for AEABI (RTABI 4.2)1243if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||1244Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||1245Subtarget->isTargetWindows()) {1246setOperationAction(ISD::SREM, MVT::i64, Custom);1247setOperationAction(ISD::UREM, MVT::i64, Custom);1248HasStandaloneRem = false;12491250if (Subtarget->isTargetWindows()) {1251const struct {1252const RTLIB::Libcall Op;1253const char * const Name;1254const CallingConv::ID CC;1255} LibraryCalls[] = {1256{ RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },1257{ RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },1258{ RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },1259{ RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },12601261{ RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },1262{ RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },1263{ RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },1264{ RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },1265};12661267for (const auto &LC : LibraryCalls) {1268setLibcallName(LC.Op, LC.Name);1269setLibcallCallingConv(LC.Op, LC.CC);1270}1271} else {1272const struct {1273const RTLIB::Libcall Op;1274const char * const Name;1275const CallingConv::ID CC;1276} LibraryCalls[] = {1277{ RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },1278{ RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },1279{ RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },1280{ RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },12811282{ RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },1283{ RTLIB::UDIVREM_I16, 
"__aeabi_uidivmod", CallingConv::ARM_AAPCS },1284{ RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },1285{ RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },1286};12871288for (const auto &LC : LibraryCalls) {1289setLibcallName(LC.Op, LC.Name);1290setLibcallCallingConv(LC.Op, LC.CC);1291}1292}12931294setOperationAction(ISD::SDIVREM, MVT::i32, Custom);1295setOperationAction(ISD::UDIVREM, MVT::i32, Custom);1296setOperationAction(ISD::SDIVREM, MVT::i64, Custom);1297setOperationAction(ISD::UDIVREM, MVT::i64, Custom);1298} else {1299setOperationAction(ISD::SDIVREM, MVT::i32, Expand);1300setOperationAction(ISD::UDIVREM, MVT::i32, Expand);1301}13021303setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);1304setOperationAction(ISD::ConstantPool, MVT::i32, Custom);1305setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);1306setOperationAction(ISD::BlockAddress, MVT::i32, Custom);13071308setOperationAction(ISD::TRAP, MVT::Other, Legal);1309setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);13101311// Use the default implementation.1312setOperationAction(ISD::VASTART, MVT::Other, Custom);1313setOperationAction(ISD::VAARG, MVT::Other, Expand);1314setOperationAction(ISD::VACOPY, MVT::Other, Expand);1315setOperationAction(ISD::VAEND, MVT::Other, Expand);1316setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);1317setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);13181319if (Subtarget->isTargetWindows())1320setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);1321else1322setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);13231324// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use1325// the default expansion.1326InsertFencesForAtomic = false;1327if (Subtarget->hasAnyDataBarrier() &&1328(!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {1329// ATOMIC_FENCE needs custom lowering; the others should have been expanded1330// to ldrex/strex loops already.1331setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);1332if (!Subtarget->isThumb() || !Subtarget->isMClass())1333setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);13341335// On v8, we have particularly efficient implementations of atomic fences1336// if they can be combined with nearby atomic loads and stores.1337if (!Subtarget->hasAcquireRelease() ||1338getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {1339// Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.1340InsertFencesForAtomic = true;1341}1342} else {1343// If there's anything we can use as a barrier, go through custom lowering1344// for ATOMIC_FENCE.1345// If target has DMB in thumb, Fences can be inserted.1346if (Subtarget->hasDataBarrier())1347InsertFencesForAtomic = true;13481349setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,1350Subtarget->hasAnyDataBarrier() ? 
Custom : Expand);13511352// Set them all for libcall, which will force libcalls.1353setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);1354setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);1355setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);1356setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);1357setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);1358setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);1359setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);1360setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);1361setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);1362setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);1363setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);1364setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);1365// Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the1366// Unordered/Monotonic case.1367if (!InsertFencesForAtomic) {1368setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);1369setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);1370}1371}13721373// Compute supported atomic widths.1374if (Subtarget->isTargetLinux() ||1375(!Subtarget->isMClass() && Subtarget->hasV6Ops())) {1376// For targets where __sync_* routines are reliably available, we use them1377// if necessary.1378//1379// ARM Linux always supports 64-bit atomics through kernel-assisted atomic1380// routines (kernel 3.1 or later). FIXME: Not with compiler-rt?1381//1382// ARMv6 targets have native instructions in ARM mode. For Thumb mode,1383// such targets should provide __sync_* routines, which use the ARM mode1384// instructions. (ARMv6 doesn't have dmb, but it has an equivalent1385// encoding; see ARMISD::MEMBARRIER_MCR.)1386setMaxAtomicSizeInBitsSupported(64);1387} else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||1388Subtarget->hasForced32BitAtomics()) {1389// Cortex-M (besides Cortex-M0) have 32-bit atomics.1390setMaxAtomicSizeInBitsSupported(32);1391} else {1392// We can't assume anything about other targets; just use libatomic1393// routines.1394setMaxAtomicSizeInBitsSupported(0);1395}13961397setMaxDivRemBitWidthSupported(64);13981399setOperationAction(ISD::PREFETCH, MVT::Other, Custom);14001401// Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.1402if (!Subtarget->hasV6Ops()) {1403setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);1404setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);1405}1406setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);14071408if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&1409!Subtarget->isThumb1Only()) {1410// Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR1411// iff target supports vfp2.1412setOperationAction(ISD::BITCAST, MVT::i64, Custom);1413setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);1414setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);1415setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);1416setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);1417setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);1418setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);1419setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);1420setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);1421}14221423// We want to custom lower some of our intrinsics.1424setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);1425setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);1426setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, 
Custom);1427setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);1428if (Subtarget->useSjLjEH())1429setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");14301431setOperationAction(ISD::SETCC, MVT::i32, Expand);1432setOperationAction(ISD::SETCC, MVT::f32, Expand);1433setOperationAction(ISD::SETCC, MVT::f64, Expand);1434setOperationAction(ISD::SELECT, MVT::i32, Custom);1435setOperationAction(ISD::SELECT, MVT::f32, Custom);1436setOperationAction(ISD::SELECT, MVT::f64, Custom);1437setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);1438setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);1439setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);1440if (Subtarget->hasFullFP16()) {1441setOperationAction(ISD::SETCC, MVT::f16, Expand);1442setOperationAction(ISD::SELECT, MVT::f16, Custom);1443setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);1444}14451446setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);14471448setOperationAction(ISD::BRCOND, MVT::Other, Custom);1449setOperationAction(ISD::BR_CC, MVT::i32, Custom);1450if (Subtarget->hasFullFP16())1451setOperationAction(ISD::BR_CC, MVT::f16, Custom);1452setOperationAction(ISD::BR_CC, MVT::f32, Custom);1453setOperationAction(ISD::BR_CC, MVT::f64, Custom);1454setOperationAction(ISD::BR_JT, MVT::Other, Custom);14551456// We don't support sin/cos/fmod/copysign/pow1457setOperationAction(ISD::FSIN, MVT::f64, Expand);1458setOperationAction(ISD::FSIN, MVT::f32, Expand);1459setOperationAction(ISD::FCOS, MVT::f32, Expand);1460setOperationAction(ISD::FCOS, MVT::f64, Expand);1461setOperationAction(ISD::FSINCOS, MVT::f64, Expand);1462setOperationAction(ISD::FSINCOS, MVT::f32, Expand);1463setOperationAction(ISD::FREM, MVT::f64, Expand);1464setOperationAction(ISD::FREM, MVT::f32, Expand);1465if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&1466!Subtarget->isThumb1Only()) {1467setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);1468setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);1469}1470setOperationAction(ISD::FPOW, MVT::f64, Expand);1471setOperationAction(ISD::FPOW, MVT::f32, Expand);14721473if (!Subtarget->hasVFP4Base()) {1474setOperationAction(ISD::FMA, MVT::f64, Expand);1475setOperationAction(ISD::FMA, MVT::f32, Expand);1476}14771478// Various VFP goodness1479if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {1480// FP-ARMv8 adds f64 <-> f16 conversion. 
Before that it should be expanded.1481if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {1482setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);1483setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);1484}14851486// fp16 is a special v7 extension that adds f16 <-> f32 conversions.1487if (!Subtarget->hasFP16()) {1488setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);1489setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);1490}14911492// Strict floating-point comparisons need custom lowering.1493setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);1494setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);1495setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);1496setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);1497setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);1498setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);1499}15001501// Use __sincos_stret if available.1502if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&1503getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {1504setOperationAction(ISD::FSINCOS, MVT::f64, Custom);1505setOperationAction(ISD::FSINCOS, MVT::f32, Custom);1506}15071508// FP-ARMv8 implements a lot of rounding-like FP operations.1509if (Subtarget->hasFPARMv8Base()) {1510setOperationAction(ISD::FFLOOR, MVT::f32, Legal);1511setOperationAction(ISD::FCEIL, MVT::f32, Legal);1512setOperationAction(ISD::FROUND, MVT::f32, Legal);1513setOperationAction(ISD::FTRUNC, MVT::f32, Legal);1514setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);1515setOperationAction(ISD::FRINT, MVT::f32, Legal);1516setOperationAction(ISD::FMINNUM, MVT::f32, Legal);1517setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);1518if (Subtarget->hasNEON()) {1519setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);1520setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);1521setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);1522setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);1523}15241525if (Subtarget->hasFP64()) {1526setOperationAction(ISD::FFLOOR, MVT::f64, Legal);1527setOperationAction(ISD::FCEIL, MVT::f64, Legal);1528setOperationAction(ISD::FROUND, MVT::f64, Legal);1529setOperationAction(ISD::FTRUNC, MVT::f64, Legal);1530setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);1531setOperationAction(ISD::FRINT, MVT::f64, Legal);1532setOperationAction(ISD::FMINNUM, MVT::f64, Legal);1533setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);1534}1535}15361537// FP16 often need to be promoted to call lib functions1538if (Subtarget->hasFullFP16()) {1539setOperationAction(ISD::FREM, MVT::f16, Promote);1540setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);1541setOperationAction(ISD::FSIN, MVT::f16, Promote);1542setOperationAction(ISD::FCOS, MVT::f16, Promote);1543setOperationAction(ISD::FTAN, MVT::f16, Promote);1544setOperationAction(ISD::FSINCOS, MVT::f16, Promote);1545setOperationAction(ISD::FPOWI, MVT::f16, Promote);1546setOperationAction(ISD::FPOW, MVT::f16, Promote);1547setOperationAction(ISD::FEXP, MVT::f16, Promote);1548setOperationAction(ISD::FEXP2, MVT::f16, Promote);1549setOperationAction(ISD::FEXP10, MVT::f16, Promote);1550setOperationAction(ISD::FLOG, MVT::f16, Promote);1551setOperationAction(ISD::FLOG10, MVT::f16, Promote);1552setOperationAction(ISD::FLOG2, MVT::f16, Promote);15531554setOperationAction(ISD::FROUND, MVT::f16, Legal);1555}15561557if (Subtarget->hasNEON()) {1558// vmin and vmax aren't available in a scalar form, so we can use1559// a NEON instruction with an undef lane instead.1560setOperationAction(ISD::FMINIMUM, MVT::f32, 
Legal);1561setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);1562setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);1563setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);1564setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);1565setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);1566setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);1567setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);15681569if (Subtarget->hasFullFP16()) {1570setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);1571setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);1572setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);1573setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);15741575setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);1576setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);1577setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);1578setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);1579}1580}15811582// On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has1583// it, but it's just a wrapper around ldexp.1584if (Subtarget->isTargetWindows()) {1585for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})1586if (isOperationExpand(Op, MVT::f32))1587setOperationAction(Op, MVT::f32, Promote);1588}15891590// LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i161591// isn't legal.1592for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})1593if (isOperationExpand(Op, MVT::f16))1594setOperationAction(Op, MVT::f16, Promote);15951596// We have target-specific dag combine patterns for the following nodes:1597// ARMISD::VMOVRRD - No need to call setTargetDAGCombine1598setTargetDAGCombine(1599{ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR});16001601if (Subtarget->hasMVEIntegerOps())1602setTargetDAGCombine(ISD::VSELECT);16031604if (Subtarget->hasV6Ops())1605setTargetDAGCombine(ISD::SRL);1606if (Subtarget->isThumb1Only())1607setTargetDAGCombine(ISD::SHL);1608// Attempt to lower smin/smax to ssat/usat1609if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||1610Subtarget->isThumb2()) {1611setTargetDAGCombine({ISD::SMIN, ISD::SMAX});1612}16131614setStackPointerRegisterToSaveRestore(ARM::SP);16151616if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||1617!Subtarget->hasVFP2Base() || Subtarget->hasMinSize())1618setSchedulingPreference(Sched::RegPressure);1619else1620setSchedulingPreference(Sched::Hybrid);16211622//// temporary - rewrite interface to use type1623MaxStoresPerMemset = 8;1624MaxStoresPerMemsetOptSize = 4;1625MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores1626MaxStoresPerMemcpyOptSize = 2;1627MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores1628MaxStoresPerMemmoveOptSize = 2;16291630// On ARM arguments smaller than 4 bytes are extended, so all arguments1631// are at least 4 bytes aligned.1632setMinStackArgumentAlignment(Align(4));16331634// Prefer likely predicted branches to selects on out-of-order cores.1635PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();16361637setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));1638setPrefFunctionAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));16391640setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));1641}16421643bool ARMTargetLowering::useSoftFloat() const {1644return Subtarget->useSoftFloat();1645}16461647// FIXME: It might make sense to define the representative register class as the1648// nearest super-register that has a non-null superset. 
For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
// SPR's representative would be DPR_VFP2. This should work well if register
// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
// classes' representatives transitively. We have not implemented this because
// of the difficulty prior to coalescing of modeling operand register classes
// due to the common occurrence of cross class copies and subregister insertions
// and extractions.
std::pair<const TargetRegisterClass *, uint8_t>
ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}

const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V) \
  case V: \
    return #V;
  switch ((ARMISD::NodeType)Opcode) {
  case 
ARMISD::FIRST_NUMBER:1701break;1702MAKE_CASE(ARMISD::Wrapper)1703MAKE_CASE(ARMISD::WrapperPIC)1704MAKE_CASE(ARMISD::WrapperJT)1705MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)1706MAKE_CASE(ARMISD::CALL)1707MAKE_CASE(ARMISD::CALL_PRED)1708MAKE_CASE(ARMISD::CALL_NOLINK)1709MAKE_CASE(ARMISD::tSECALL)1710MAKE_CASE(ARMISD::t2CALL_BTI)1711MAKE_CASE(ARMISD::BRCOND)1712MAKE_CASE(ARMISD::BR_JT)1713MAKE_CASE(ARMISD::BR2_JT)1714MAKE_CASE(ARMISD::RET_GLUE)1715MAKE_CASE(ARMISD::SERET_GLUE)1716MAKE_CASE(ARMISD::INTRET_GLUE)1717MAKE_CASE(ARMISD::PIC_ADD)1718MAKE_CASE(ARMISD::CMP)1719MAKE_CASE(ARMISD::CMN)1720MAKE_CASE(ARMISD::CMPZ)1721MAKE_CASE(ARMISD::CMPFP)1722MAKE_CASE(ARMISD::CMPFPE)1723MAKE_CASE(ARMISD::CMPFPw0)1724MAKE_CASE(ARMISD::CMPFPEw0)1725MAKE_CASE(ARMISD::BCC_i64)1726MAKE_CASE(ARMISD::FMSTAT)1727MAKE_CASE(ARMISD::CMOV)1728MAKE_CASE(ARMISD::SSAT)1729MAKE_CASE(ARMISD::USAT)1730MAKE_CASE(ARMISD::ASRL)1731MAKE_CASE(ARMISD::LSRL)1732MAKE_CASE(ARMISD::LSLL)1733MAKE_CASE(ARMISD::SRL_GLUE)1734MAKE_CASE(ARMISD::SRA_GLUE)1735MAKE_CASE(ARMISD::RRX)1736MAKE_CASE(ARMISD::ADDC)1737MAKE_CASE(ARMISD::ADDE)1738MAKE_CASE(ARMISD::SUBC)1739MAKE_CASE(ARMISD::SUBE)1740MAKE_CASE(ARMISD::LSLS)1741MAKE_CASE(ARMISD::VMOVRRD)1742MAKE_CASE(ARMISD::VMOVDRR)1743MAKE_CASE(ARMISD::VMOVhr)1744MAKE_CASE(ARMISD::VMOVrh)1745MAKE_CASE(ARMISD::VMOVSR)1746MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)1747MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)1748MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)1749MAKE_CASE(ARMISD::TC_RETURN)1750MAKE_CASE(ARMISD::THREAD_POINTER)1751MAKE_CASE(ARMISD::DYN_ALLOC)1752MAKE_CASE(ARMISD::MEMBARRIER_MCR)1753MAKE_CASE(ARMISD::PRELOAD)1754MAKE_CASE(ARMISD::LDRD)1755MAKE_CASE(ARMISD::STRD)1756MAKE_CASE(ARMISD::WIN__CHKSTK)1757MAKE_CASE(ARMISD::WIN__DBZCHK)1758MAKE_CASE(ARMISD::PREDICATE_CAST)1759MAKE_CASE(ARMISD::VECTOR_REG_CAST)1760MAKE_CASE(ARMISD::MVESEXT)1761MAKE_CASE(ARMISD::MVEZEXT)1762MAKE_CASE(ARMISD::MVETRUNC)1763MAKE_CASE(ARMISD::VCMP)1764MAKE_CASE(ARMISD::VCMPZ)1765MAKE_CASE(ARMISD::VTST)1766MAKE_CASE(ARMISD::VSHLs)1767MAKE_CASE(ARMISD::VSHLu)1768MAKE_CASE(ARMISD::VSHLIMM)1769MAKE_CASE(ARMISD::VSHRsIMM)1770MAKE_CASE(ARMISD::VSHRuIMM)1771MAKE_CASE(ARMISD::VRSHRsIMM)1772MAKE_CASE(ARMISD::VRSHRuIMM)1773MAKE_CASE(ARMISD::VRSHRNIMM)1774MAKE_CASE(ARMISD::VQSHLsIMM)1775MAKE_CASE(ARMISD::VQSHLuIMM)1776MAKE_CASE(ARMISD::VQSHLsuIMM)1777MAKE_CASE(ARMISD::VQSHRNsIMM)1778MAKE_CASE(ARMISD::VQSHRNuIMM)1779MAKE_CASE(ARMISD::VQSHRNsuIMM)1780MAKE_CASE(ARMISD::VQRSHRNsIMM)1781MAKE_CASE(ARMISD::VQRSHRNuIMM)1782MAKE_CASE(ARMISD::VQRSHRNsuIMM)1783MAKE_CASE(ARMISD::VSLIIMM)1784MAKE_CASE(ARMISD::VSRIIMM)1785MAKE_CASE(ARMISD::VGETLANEu)1786MAKE_CASE(ARMISD::VGETLANEs)1787MAKE_CASE(ARMISD::VMOVIMM)1788MAKE_CASE(ARMISD::VMVNIMM)1789MAKE_CASE(ARMISD::VMOVFPIMM)1790MAKE_CASE(ARMISD::VDUP)1791MAKE_CASE(ARMISD::VDUPLANE)1792MAKE_CASE(ARMISD::VEXT)1793MAKE_CASE(ARMISD::VREV64)1794MAKE_CASE(ARMISD::VREV32)1795MAKE_CASE(ARMISD::VREV16)1796MAKE_CASE(ARMISD::VZIP)1797MAKE_CASE(ARMISD::VUZP)1798MAKE_CASE(ARMISD::VTRN)1799MAKE_CASE(ARMISD::VTBL1)1800MAKE_CASE(ARMISD::VTBL2)1801MAKE_CASE(ARMISD::VMOVN)1802MAKE_CASE(ARMISD::VQMOVNs)1803MAKE_CASE(ARMISD::VQMOVNu)1804MAKE_CASE(ARMISD::VCVTN)1805MAKE_CASE(ARMISD::VCVTL)1806MAKE_CASE(ARMISD::VIDUP)1807MAKE_CASE(ARMISD::VMULLs)1808MAKE_CASE(ARMISD::VMULLu)1809MAKE_CASE(ARMISD::VQDMULH)1810MAKE_CASE(ARMISD::VADDVs)1811MAKE_CASE(ARMISD::VADDVu)1812MAKE_CASE(ARMISD::VADDVps)1813MAKE_CASE(ARMISD::VADDVpu)1814MAKE_CASE(ARMISD::VADDLVs)1815MAKE_CASE(ARMISD::VADDLVu)1816MAKE_CASE(ARMISD::VADDLVAs)1817MAKE_CASE(ARMISD::VADDLVAu)1818
MAKE_CASE(ARMISD::VADDLVps)1819MAKE_CASE(ARMISD::VADDLVpu)1820MAKE_CASE(ARMISD::VADDLVAps)1821MAKE_CASE(ARMISD::VADDLVApu)1822MAKE_CASE(ARMISD::VMLAVs)1823MAKE_CASE(ARMISD::VMLAVu)1824MAKE_CASE(ARMISD::VMLAVps)1825MAKE_CASE(ARMISD::VMLAVpu)1826MAKE_CASE(ARMISD::VMLALVs)1827MAKE_CASE(ARMISD::VMLALVu)1828MAKE_CASE(ARMISD::VMLALVps)1829MAKE_CASE(ARMISD::VMLALVpu)1830MAKE_CASE(ARMISD::VMLALVAs)1831MAKE_CASE(ARMISD::VMLALVAu)1832MAKE_CASE(ARMISD::VMLALVAps)1833MAKE_CASE(ARMISD::VMLALVApu)1834MAKE_CASE(ARMISD::VMINVu)1835MAKE_CASE(ARMISD::VMINVs)1836MAKE_CASE(ARMISD::VMAXVu)1837MAKE_CASE(ARMISD::VMAXVs)1838MAKE_CASE(ARMISD::UMAAL)1839MAKE_CASE(ARMISD::UMLAL)1840MAKE_CASE(ARMISD::SMLAL)1841MAKE_CASE(ARMISD::SMLALBB)1842MAKE_CASE(ARMISD::SMLALBT)1843MAKE_CASE(ARMISD::SMLALTB)1844MAKE_CASE(ARMISD::SMLALTT)1845MAKE_CASE(ARMISD::SMULWB)1846MAKE_CASE(ARMISD::SMULWT)1847MAKE_CASE(ARMISD::SMLALD)1848MAKE_CASE(ARMISD::SMLALDX)1849MAKE_CASE(ARMISD::SMLSLD)1850MAKE_CASE(ARMISD::SMLSLDX)1851MAKE_CASE(ARMISD::SMMLAR)1852MAKE_CASE(ARMISD::SMMLSR)1853MAKE_CASE(ARMISD::QADD16b)1854MAKE_CASE(ARMISD::QSUB16b)1855MAKE_CASE(ARMISD::QADD8b)1856MAKE_CASE(ARMISD::QSUB8b)1857MAKE_CASE(ARMISD::UQADD16b)1858MAKE_CASE(ARMISD::UQSUB16b)1859MAKE_CASE(ARMISD::UQADD8b)1860MAKE_CASE(ARMISD::UQSUB8b)1861MAKE_CASE(ARMISD::BUILD_VECTOR)1862MAKE_CASE(ARMISD::BFI)1863MAKE_CASE(ARMISD::VORRIMM)1864MAKE_CASE(ARMISD::VBICIMM)1865MAKE_CASE(ARMISD::VBSP)1866MAKE_CASE(ARMISD::MEMCPY)1867MAKE_CASE(ARMISD::VLD1DUP)1868MAKE_CASE(ARMISD::VLD2DUP)1869MAKE_CASE(ARMISD::VLD3DUP)1870MAKE_CASE(ARMISD::VLD4DUP)1871MAKE_CASE(ARMISD::VLD1_UPD)1872MAKE_CASE(ARMISD::VLD2_UPD)1873MAKE_CASE(ARMISD::VLD3_UPD)1874MAKE_CASE(ARMISD::VLD4_UPD)1875MAKE_CASE(ARMISD::VLD1x2_UPD)1876MAKE_CASE(ARMISD::VLD1x3_UPD)1877MAKE_CASE(ARMISD::VLD1x4_UPD)1878MAKE_CASE(ARMISD::VLD2LN_UPD)1879MAKE_CASE(ARMISD::VLD3LN_UPD)1880MAKE_CASE(ARMISD::VLD4LN_UPD)1881MAKE_CASE(ARMISD::VLD1DUP_UPD)1882MAKE_CASE(ARMISD::VLD2DUP_UPD)1883MAKE_CASE(ARMISD::VLD3DUP_UPD)1884MAKE_CASE(ARMISD::VLD4DUP_UPD)1885MAKE_CASE(ARMISD::VST1_UPD)1886MAKE_CASE(ARMISD::VST2_UPD)1887MAKE_CASE(ARMISD::VST3_UPD)1888MAKE_CASE(ARMISD::VST4_UPD)1889MAKE_CASE(ARMISD::VST1x2_UPD)1890MAKE_CASE(ARMISD::VST1x3_UPD)1891MAKE_CASE(ARMISD::VST1x4_UPD)1892MAKE_CASE(ARMISD::VST2LN_UPD)1893MAKE_CASE(ARMISD::VST3LN_UPD)1894MAKE_CASE(ARMISD::VST4LN_UPD)1895MAKE_CASE(ARMISD::WLS)1896MAKE_CASE(ARMISD::WLSSETUP)1897MAKE_CASE(ARMISD::LE)1898MAKE_CASE(ARMISD::LOOP_DEC)1899MAKE_CASE(ARMISD::CSINV)1900MAKE_CASE(ARMISD::CSNEG)1901MAKE_CASE(ARMISD::CSINC)1902MAKE_CASE(ARMISD::MEMCPYLOOP)1903MAKE_CASE(ARMISD::MEMSETLOOP)1904#undef MAKE_CASE1905}1906return nullptr;1907}19081909EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,1910EVT VT) const {1911if (!VT.isVector())1912return getPointerTy(DL);19131914// MVE has a predicate register.1915if ((Subtarget->hasMVEIntegerOps() &&1916(VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||1917VT == MVT::v16i8)) ||1918(Subtarget->hasMVEFloatOps() &&1919(VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))1920return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());1921return VT.changeVectorElementTypeToInteger();1922}19231924/// getRegClassFor - Return the register class that should be used for the1925/// specified value type.1926const TargetRegisterClass *1927ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {1928(void)isDivergent;1929// Map v4i64 to QQ registers but do not make the type legal. 
Similarly map
// v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
// load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
// MVE Q registers.
  if (Subtarget->hasNEON()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
  if (Subtarget->hasMVEIntegerOps()) {
    if (VT == MVT::v4i64)
      return &ARM::MQQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::MQQQQPRRegClass;
  }
  return TargetLowering::getRegClassFor(VT);
}

// memcpy and other memory intrinsics typically try to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               Align &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign =
      (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
  return true;
}

// Create a fast isel object.
FastISel *
ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return ARM::createFastISel(funcInfo, libInfo);
}

Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
    return Sched::ILP;

  return Sched::RegPressure;
}

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

static bool isSRL16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SRL)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

static bool isSRA16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SRA)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}

static bool isSHL16(const SDValue &Op) {
  if (Op.getOpcode() != ISD::SHL)
    return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    return Const->getZExtValue() == 16;
  return false;
}
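
// A sign-extended 16-bit quantity typically reaches the DAG as the pair
//   (sra (shl x, 16), 16)
// i.e. an SRA-by-16 whose operand is an SHL-by-16; isS16 below accepts exactly
// that shape, or any value for which ComputeNumSignBits reports 17 sign bits.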

// Check for a signed 16-bit value. We special case SRA because it makes it
// simpler when also looking for SRAs that aren't sign extending a
// smaller value. Without the check, we'd need to take extra care with
// checking order for some operations.
static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
  if (isSRA16(Op))
    return isSHL16(Op.getOperand(0));
  return DAG.ComputeNumSignBits(Op) == 17;
}

/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}

/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}
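
// Some IEEE predicates have no single ARM condition code, so FPCCToARMCC above
// may hand back a second condition in CondCode2. For example SETUEQ ("equal or
// unordered") maps to EQ with a VS fallback, and SETONE maps to MI plus GT;
// callers that get CondCode2 != AL are expected to emit a second predicated
// check to cover the extra case.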

//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
CallingConv::ID
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_AAPCS:
  case CallingConv::ARM_APCS:
  case CallingConv::GHC:
  case CallingConv::CFGuard_Check:
    return CC;
  case CallingConv::PreserveMost:
    return CallingConv::PreserveMost;
  case CallingConv::PreserveAll:
    return CallingConv::PreserveAll;
  case CallingConv::ARM_AAPCS_VFP:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
  case CallingConv::C:
  case CallingConv::Tail:
    if (!Subtarget->isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  case CallingConv::Fast:
  case CallingConv::CXX_FAST_TLS:
    if (!Subtarget->isAAPCS_ABI()) {
      if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
    } else if (Subtarget->hasVFP2Base() &&
               !Subtarget->isThumb1Only() && !isVarArg)
      return CallingConv::ARM_AAPCS_VFP;
    else
      return CallingConv::ARM_AAPCS;
  }
}

CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                 bool isVarArg) const {
  return CCAssignFnForNode(CC, false, isVarArg);
}

CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                   bool isVarArg) const {
  return CCAssignFnForNode(CC, true, isVarArg);
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
  case CallingConv::ARM_APCS:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
  case CallingConv::ARM_AAPCS:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::ARM_AAPCS_VFP:
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
  case CallingConv::PreserveMost:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::PreserveAll:
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
  case CallingConv::CFGuard_Check:
    return (Return ? 
RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);2168}2169}21702171SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,2172MVT LocVT, MVT ValVT, SDValue Val) const {2173Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),2174Val);2175if (Subtarget->hasFullFP16()) {2176Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);2177} else {2178Val = DAG.getNode(ISD::TRUNCATE, dl,2179MVT::getIntegerVT(ValVT.getSizeInBits()), Val);2180Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);2181}2182return Val;2183}21842185SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,2186MVT LocVT, MVT ValVT,2187SDValue Val) const {2188if (Subtarget->hasFullFP16()) {2189Val = DAG.getNode(ARMISD::VMOVrh, dl,2190MVT::getIntegerVT(LocVT.getSizeInBits()), Val);2191} else {2192Val = DAG.getNode(ISD::BITCAST, dl,2193MVT::getIntegerVT(ValVT.getSizeInBits()), Val);2194Val = DAG.getNode(ISD::ZERO_EXTEND, dl,2195MVT::getIntegerVT(LocVT.getSizeInBits()), Val);2196}2197return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);2198}21992200/// LowerCallResult - Lower the result values of a call into the2201/// appropriate copies out of appropriate physical registers.2202SDValue ARMTargetLowering::LowerCallResult(2203SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,2204const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,2205SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,2206SDValue ThisVal, bool isCmseNSCall) const {2207// Assign locations to each value returned by this call.2208SmallVector<CCValAssign, 16> RVLocs;2209CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,2210*DAG.getContext());2211CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));22122213// Copy all of the result registers out of their specified physreg.2214for (unsigned i = 0; i != RVLocs.size(); ++i) {2215CCValAssign VA = RVLocs[i];22162217// Pass 'this' value directly from the argument to return value, to avoid2218// reg unit interference2219if (i == 0 && isThisReturn) {2220assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&2221"unexpected return calling convention register assignment");2222InVals.push_back(ThisVal);2223continue;2224}22252226SDValue Val;2227if (VA.needsCustom() &&2228(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {2229// Handle f64 or half of a v2f64.2230SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,2231InGlue);2232Chain = Lo.getValue(1);2233InGlue = Lo.getValue(2);2234VA = RVLocs[++i]; // skip ahead to next loc2235SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,2236InGlue);2237Chain = Hi.getValue(1);2238InGlue = Hi.getValue(2);2239if (!Subtarget->isLittle())2240std::swap (Lo, Hi);2241Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);22422243if (VA.getLocVT() == MVT::v2f64) {2244SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);2245Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,2246DAG.getConstant(0, dl, MVT::i32));22472248VA = RVLocs[++i]; // skip ahead to next loc2249Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);2250Chain = Lo.getValue(1);2251InGlue = Lo.getValue(2);2252VA = RVLocs[++i]; // skip ahead to next loc2253Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);2254Chain = Hi.getValue(1);2255InGlue = Hi.getValue(2);2256if (!Subtarget->isLittle())2257std::swap (Lo, Hi);2258Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);2259Val = 
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,2260DAG.getConstant(1, dl, MVT::i32));2261}2262} else {2263Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),2264InGlue);2265Chain = Val.getValue(1);2266InGlue = Val.getValue(2);2267}22682269switch (VA.getLocInfo()) {2270default: llvm_unreachable("Unknown loc info!");2271case CCValAssign::Full: break;2272case CCValAssign::BCvt:2273Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);2274break;2275}22762277// f16 arguments have their size extended to 4 bytes and passed as if they2278// had been copied to the LSBs of a 32-bit register.2279// For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)2280if (VA.needsCustom() &&2281(VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))2282Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);22832284// On CMSE Non-secure Calls, call results (returned values) whose bitwidth2285// is less than 32 bits must be sign- or zero-extended after the call for2286// security reasons. Although the ABI mandates an extension done by the2287// callee, the latter cannot be trusted to follow the rules of the ABI.2288const ISD::InputArg &Arg = Ins[VA.getValNo()];2289if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&2290VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))2291Val = handleCMSEValue(Val, Arg, DAG, dl);22922293InVals.push_back(Val);2294}22952296return Chain;2297}22982299std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(2300const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,2301bool IsTailCall, int SPDiff) const {2302SDValue DstAddr;2303MachinePointerInfo DstInfo;2304int32_t Offset = VA.getLocMemOffset();2305MachineFunction &MF = DAG.getMachineFunction();23062307if (IsTailCall) {2308Offset += SPDiff;2309auto PtrVT = getPointerTy(DAG.getDataLayout());2310int Size = VA.getLocVT().getFixedSizeInBits() / 8;2311int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);2312DstAddr = DAG.getFrameIndex(FI, PtrVT);2313DstInfo =2314MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);2315} else {2316SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);2317DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),2318StackPtr, PtrOff);2319DstInfo =2320MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);2321}23222323return std::make_pair(DstAddr, DstInfo);2324}23252326void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,2327SDValue Chain, SDValue &Arg,2328RegsToPassVector &RegsToPass,2329CCValAssign &VA, CCValAssign &NextVA,2330SDValue &StackPtr,2331SmallVectorImpl<SDValue> &MemOpChains,2332bool IsTailCall,2333int SPDiff) const {2334SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,2335DAG.getVTList(MVT::i32, MVT::i32), Arg);2336unsigned id = Subtarget->isLittle() ? 
0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                    getPointerTy(DAG.getDataLayout()));

    SDValue DstAddr;
    MachinePointerInfo DstInfo;
    std::tie(DstAddr, DstInfo) =
        computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
  }
}

static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}

/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
SDValue
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool doesNotRet = CLI.DoesNotReturn;
  bool isVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  MachineFunction::CallSiteInfo CSInfo;
  bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool isThisReturn = false;
  bool isCmseNSCall = false;
  bool isSibCall = false;
  bool PreferIndirect = false;
  bool GuardWithBTI = false;

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));

  // Lower 'returns_twice' calls to a pseudo-instruction.
  if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
      !Subtarget->noBTIAtReturnTwice())
    GuardWithBTI = AFI->branchTargetEnforcement();

  // Determine whether this is a non-secure function call.
  if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
    isCmseNSCall = true;

  // Disable tail calls if they're not supported.
  if (!Subtarget->supportsTailCall())
    isTailCall = false;

  // For both the non-secure calls and the returns from a CMSE entry function,
  // the function needs to do some extra work after the call, or before the
  // return, respectively, thus it cannot end with a tail call.
  if (isCmseNSCall || AFI->isCmseNSEntryFunction())
    isTailCall = false;

  if (isa<GlobalAddressSDNode>(Callee)) {
    // If we're optimizing for minimum size and the function is called three or
    // more times in this block, we can improve codesize by calling indirectly
    // as BLXr has a 16-bit encoding.
    auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
    if (CLI.CB) {
      auto *BB = CLI.CB->getParent();
      PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
                       count_if(GV->users(), [&BB](const User *U) {
                         return isa<Instruction>(U) 
&&2424cast<Instruction>(U)->getParent() == BB;2425}) > 2;2426}2427}2428if (isTailCall) {2429// Check if it's really possible to do a tail call.2430isTailCall =2431IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);24322433if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&2434CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)2435isSibCall = true;24362437// We don't support GuaranteedTailCallOpt for ARM, only automatically2438// detected sibcalls.2439if (isTailCall)2440++NumTailCalls;2441}24422443if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())2444report_fatal_error("failed to perform tail call elimination on a call "2445"site marked musttail");24462447// Get a count of how many bytes are to be pushed on the stack.2448unsigned NumBytes = CCInfo.getStackSize();24492450// SPDiff is the byte offset of the call's argument area from the callee's.2451// Stores to callee stack arguments will be placed in FixedStackSlots offset2452// by this amount for a tail call. In a sibling call it must be 0 because the2453// caller will deallocate the entire stack and the callee still expects its2454// arguments to begin at SP+0. Completely unused for non-tail calls.2455int SPDiff = 0;24562457if (isTailCall && !isSibCall) {2458auto FuncInfo = MF.getInfo<ARMFunctionInfo>();2459unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();24602461// Since callee will pop argument stack as a tail call, we must keep the2462// popped size 16-byte aligned.2463Align StackAlign = DAG.getDataLayout().getStackAlignment();2464NumBytes = alignTo(NumBytes, StackAlign);24652466// SPDiff will be negative if this tail call requires more space than we2467// would automatically have in our incoming argument space. Positive if we2468// can actually shrink the stack.2469SPDiff = NumReusableBytes - NumBytes;24702471// If this call requires more stack than we have available from2472// LowerFormalArguments, tell FrameLowering to reserve space for it.2473if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)2474AFI->setArgRegsSaveSize(-SPDiff);2475}24762477if (isSibCall) {2478// For sibling tail calls, memory operands are available in our caller's stack.2479NumBytes = 0;2480} else {2481// Adjust the stack pointer for the new arguments...2482// These operations are automatically eliminated by the prolog/epilog pass2483Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);2484}24852486SDValue StackPtr =2487DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));24882489RegsToPassVector RegsToPass;2490SmallVector<SDValue, 8> MemOpChains;24912492// During a tail call, stores to the argument area must happen after all of2493// the function's incoming arguments have been loaded because they may alias.2494// This is done by folding in a TokenFactor from LowerFormalArguments, but2495// there's no point in doing so repeatedly so this tracks whether that's2496// happened yet.2497bool AfterFormalArgLoads = false;24982499// Walk the register/memloc assignments, inserting copies/loads. 
In the case2500// of tail call optimization, arguments are handled later.2501for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();2502i != e;2503++i, ++realArgIdx) {2504CCValAssign &VA = ArgLocs[i];2505SDValue Arg = OutVals[realArgIdx];2506ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;2507bool isByVal = Flags.isByVal();25082509// Promote the value if needed.2510switch (VA.getLocInfo()) {2511default: llvm_unreachable("Unknown loc info!");2512case CCValAssign::Full: break;2513case CCValAssign::SExt:2514Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);2515break;2516case CCValAssign::ZExt:2517Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);2518break;2519case CCValAssign::AExt:2520Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);2521break;2522case CCValAssign::BCvt:2523Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);2524break;2525}25262527if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {2528Chain = DAG.getStackArgumentTokenFactor(Chain);2529AfterFormalArgLoads = true;2530}25312532// f16 arguments have their size extended to 4 bytes and passed as if they2533// had been copied to the LSBs of a 32-bit register.2534// For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)2535if (VA.needsCustom() &&2536(VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {2537Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);2538} else {2539// f16 arguments could have been extended prior to argument lowering.2540// Mask them arguments if this is a CMSE nonsecure call.2541auto ArgVT = Outs[realArgIdx].ArgVT;2542if (isCmseNSCall && (ArgVT == MVT::f16)) {2543auto LocBits = VA.getLocVT().getSizeInBits();2544auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());2545SDValue Mask =2546DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));2547Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);2548Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);2549Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);2550}2551}25522553// f64 and v2f64 might be passed in i32 pairs and must be split into pieces2554if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {2555SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,2556DAG.getConstant(0, dl, MVT::i32));2557SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,2558DAG.getConstant(1, dl, MVT::i32));25592560PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],2561StackPtr, MemOpChains, isTailCall, SPDiff);25622563VA = ArgLocs[++i]; // skip ahead to next loc2564if (VA.isRegLoc()) {2565PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],2566StackPtr, MemOpChains, isTailCall, SPDiff);2567} else {2568assert(VA.isMemLoc());2569SDValue DstAddr;2570MachinePointerInfo DstInfo;2571std::tie(DstAddr, DstInfo) =2572computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);2573MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));2574}2575} else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {2576PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],2577StackPtr, MemOpChains, isTailCall, SPDiff);2578} else if (VA.isRegLoc()) {2579if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&2580Outs[0].VT == MVT::i32) {2581assert(VA.getLocVT() == MVT::i32 &&2582"unexpected calling convention register assignment");2583assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&2584"unexpected use of 'returned'");2585isThisReturn = true;2586}2587const 
TargetOptions &Options = DAG.getTarget().Options;2588if (Options.EmitCallSiteInfo)2589CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);2590RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));2591} else if (isByVal) {2592assert(VA.isMemLoc());2593unsigned offset = 0;25942595// True if this byval aggregate will be split between registers2596// and memory.2597unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();2598unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();25992600if (CurByValIdx < ByValArgsCount) {26012602unsigned RegBegin, RegEnd;2603CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);26042605EVT PtrVT =2606DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());2607unsigned int i, j;2608for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {2609SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);2610SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);2611SDValue Load =2612DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),2613DAG.InferPtrAlign(AddArg));2614MemOpChains.push_back(Load.getValue(1));2615RegsToPass.push_back(std::make_pair(j, Load));2616}26172618// If parameter size outsides register area, "offset" value2619// helps us to calculate stack slot for remained part properly.2620offset = RegEnd - RegBegin;26212622CCInfo.nextInRegsParam();2623}26242625if (Flags.getByValSize() > 4*offset) {2626auto PtrVT = getPointerTy(DAG.getDataLayout());2627SDValue Dst;2628MachinePointerInfo DstInfo;2629std::tie(Dst, DstInfo) =2630computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);2631SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);2632SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);2633SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,2634MVT::i32);2635SDValue AlignNode =2636DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);26372638SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);2639SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};2640MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,2641Ops));2642}2643} else {2644assert(VA.isMemLoc());2645SDValue DstAddr;2646MachinePointerInfo DstInfo;2647std::tie(DstAddr, DstInfo) =2648computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);26492650SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);2651MemOpChains.push_back(Store);2652}2653}26542655if (!MemOpChains.empty())2656Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);26572658// Build a sequence of copy-to-reg nodes chained together with token chain2659// and flag operands which copy the outgoing args into the appropriate regs.2660SDValue InGlue;2661for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {2662Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,2663RegsToPass[i].second, InGlue);2664InGlue = Chain.getValue(1);2665}26662667// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every2668// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol2669// node so that legalize doesn't hack it.2670bool isDirect = false;26712672const TargetMachine &TM = getTargetMachine();2673const GlobalValue *GVal = nullptr;2674if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))2675GVal = G->getGlobal();2676bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();26772678bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());2679bool isLocalARMFunc = false;2680auto PtrVt = getPointerTy(DAG.getDataLayout());26812682if 
(Subtarget->genLongCalls()) {2683assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&2684"long-calls codegen is not position independent!");2685// Handle a global address or an external symbol. If it's not one of2686// those, the target's already in a register, so we don't need to do2687// anything extra.2688if (isa<GlobalAddressSDNode>(Callee)) {2689if (Subtarget->genExecuteOnly()) {2690if (Subtarget->useMovt())2691++NumMovwMovt;2692Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,2693DAG.getTargetGlobalAddress(GVal, dl, PtrVt));2694} else {2695// Create a constant pool entry for the callee address2696unsigned ARMPCLabelIndex = AFI->createPICLabelUId();2697ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(2698GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);26992700// Get the address of the callee into a register2701SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));2702Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);2703Callee = DAG.getLoad(2704PtrVt, dl, DAG.getEntryNode(), Addr,2705MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));2706}2707} else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {2708const char *Sym = S->getSymbol();27092710if (Subtarget->genExecuteOnly()) {2711if (Subtarget->useMovt())2712++NumMovwMovt;2713Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,2714DAG.getTargetGlobalAddress(GVal, dl, PtrVt));2715} else {2716// Create a constant pool entry for the callee address2717unsigned ARMPCLabelIndex = AFI->createPICLabelUId();2718ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(2719*DAG.getContext(), Sym, ARMPCLabelIndex, 0);27202721// Get the address of the callee into a register2722SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));2723Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);2724Callee = DAG.getLoad(2725PtrVt, dl, DAG.getEntryNode(), Addr,2726MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));2727}2728}2729} else if (isa<GlobalAddressSDNode>(Callee)) {2730if (!PreferIndirect) {2731isDirect = true;2732bool isDef = GVal->isStrongDefinitionForLinker();27332734// ARM call to a local ARM function is predicable.2735isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);2736// tBX takes a register source operand.2737if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {2738assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");2739Callee = DAG.getNode(2740ARMISD::WrapperPIC, dl, PtrVt,2741DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));2742Callee = DAG.getLoad(2743PtrVt, dl, DAG.getEntryNode(), Callee,2744MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),2745MachineMemOperand::MODereferenceable |2746MachineMemOperand::MOInvariant);2747} else if (Subtarget->isTargetCOFF()) {2748assert(Subtarget->isTargetWindows() &&2749"Windows is the only supported COFF target");2750unsigned TargetFlags = ARMII::MO_NO_FLAG;2751if (GVal->hasDLLImportStorageClass())2752TargetFlags = ARMII::MO_DLLIMPORT;2753else if (!TM.shouldAssumeDSOLocal(GVal))2754TargetFlags = ARMII::MO_COFFSTUB;2755Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,2756TargetFlags);2757if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))2758Callee =2759DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),2760DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),2761MachinePointerInfo::getGOT(DAG.getMachineFunction()));2762} else {2763Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);2764}2765}2766} else if 
(ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {2767isDirect = true;2768// tBX takes a register source operand.2769const char *Sym = S->getSymbol();2770if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {2771unsigned ARMPCLabelIndex = AFI->createPICLabelUId();2772ARMConstantPoolValue *CPV =2773ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,2774ARMPCLabelIndex, 4);2775SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));2776CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);2777Callee = DAG.getLoad(2778PtrVt, dl, DAG.getEntryNode(), CPAddr,2779MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));2780SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);2781Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);2782} else {2783Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);2784}2785}27862787if (isCmseNSCall) {2788assert(!isARMFunc && !isDirect &&2789"Cannot handle call to ARM function or direct call");2790if (NumBytes > 0) {2791DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),2792"call to non-secure function would "2793"require passing arguments on stack",2794dl.getDebugLoc());2795DAG.getContext()->diagnose(Diag);2796}2797if (isStructRet) {2798DiagnosticInfoUnsupported Diag(2799DAG.getMachineFunction().getFunction(),2800"call to non-secure function would return value through pointer",2801dl.getDebugLoc());2802DAG.getContext()->diagnose(Diag);2803}2804}28052806// FIXME: handle tail calls differently.2807unsigned CallOpc;2808if (Subtarget->isThumb()) {2809if (GuardWithBTI)2810CallOpc = ARMISD::t2CALL_BTI;2811else if (isCmseNSCall)2812CallOpc = ARMISD::tSECALL;2813else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())2814CallOpc = ARMISD::CALL_NOLINK;2815else2816CallOpc = ARMISD::CALL;2817} else {2818if (!isDirect && !Subtarget->hasV5TOps())2819CallOpc = ARMISD::CALL_NOLINK;2820else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&2821// Emit regular call when code size is the priority2822!Subtarget->hasMinSize())2823// "mov lr, pc; b _foo" to avoid confusing the RSP2824CallOpc = ARMISD::CALL_NOLINK;2825else2826CallOpc = isLocalARMFunc ? 
ARMISD::CALL_PRED : ARMISD::CALL;2827}28282829// We don't usually want to end the call-sequence here because we would tidy2830// the frame up *after* the call, however in the ABI-changing tail-call case2831// we've carefully laid out the parameters so that when sp is reset they'll be2832// in the correct location.2833if (isTailCall && !isSibCall) {2834Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);2835InGlue = Chain.getValue(1);2836}28372838std::vector<SDValue> Ops;2839Ops.push_back(Chain);2840Ops.push_back(Callee);28412842if (isTailCall) {2843Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));2844}28452846// Add argument registers to the end of the list so that they are known live2847// into the call.2848for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)2849Ops.push_back(DAG.getRegister(RegsToPass[i].first,2850RegsToPass[i].second.getValueType()));28512852// Add a register mask operand representing the call-preserved registers.2853const uint32_t *Mask;2854const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();2855if (isThisReturn) {2856// For 'this' returns, use the R0-preserving mask if applicable2857Mask = ARI->getThisReturnPreservedMask(MF, CallConv);2858if (!Mask) {2859// Set isThisReturn to false if the calling convention is not one that2860// allows 'returned' to be modeled in this way, so LowerCallResult does2861// not try to pass 'this' straight through2862isThisReturn = false;2863Mask = ARI->getCallPreservedMask(MF, CallConv);2864}2865} else2866Mask = ARI->getCallPreservedMask(MF, CallConv);28672868assert(Mask && "Missing call preserved mask for calling convention");2869Ops.push_back(DAG.getRegisterMask(Mask));28702871if (InGlue.getNode())2872Ops.push_back(InGlue);28732874SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);2875if (isTailCall) {2876MF.getFrameInfo().setHasTailCall();2877SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);2878DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);2879DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));2880return Ret;2881}28822883// Returns a chain and a flag for retval copy to use.2884Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);2885DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);2886InGlue = Chain.getValue(1);2887DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));28882889// If we're guaranteeing tail-calls will be honoured, the callee must2890// pop its own argument stack on return. But this call is *not* a tail call so2891// we need to undo that after it returns to restore the status-quo.2892bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;2893uint64_t CalleePopBytes =2894canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;28952896Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);2897if (!Ins.empty())2898InGlue = Chain.getValue(1);28992900// Handle result values, copying them out of physregs into vregs that we2901// return.2902return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,2903InVals, isThisReturn,2904isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);2905}29062907/// HandleByVal - Every parameter *after* a byval parameter is passed2908/// on the stack. 
Remember the next parameter register to allocate,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                    Align Alignment) const {
  // Byval (as with any stack) slots are always at least 4 byte aligned.
  Alignment = std::max(Alignment, Align(4));

  unsigned Reg = State->AllocateReg(GPRArgRegs);
  if (!Reg)
    return;

  unsigned AlignInRegs = Alignment.value() / 4;
  unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
  for (unsigned i = 0; i < Waste; ++i)
    Reg = State->AllocateReg(GPRArgRegs);

  if (!Reg)
    return;

  unsigned Excess = 4 * (ARM::R4 - Reg);

  // Special case when NSAA != SP and the parameter size is greater than the
  // size of all remaining GPR registers. In that case we can't split the
  // parameter, we must send it to the stack. We also must set the NCRN to R4,
  // so waste all remaining registers.
  const unsigned NSAAOffset = State->getStackSize();
  if (NSAAOffset != 0 && Size > Excess) {
    while (State->AllocateReg(GPRArgRegs))
      ;
    return;
  }

  // The first register for the byval parameter is the first register that
  // wasn't allocated before this method call, so it would be "reg".
  // If the parameter is small enough to be saved in the range [reg, r4), then
  // the end (first after last) register would be reg + param-size-in-regs;
  // otherwise the parameter is split between registers and the stack, and the
  // end register would be r4 in this case.
  unsigned ByValRegBegin = Reg;
  unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
  State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
  // Note, the first register was already allocated at the beginning of this
  // function; allocate the remaining registers we need.
  for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
    State->AllocateReg(GPRArgRegs);
  // A byval parameter that is split between registers and memory needs its
  // size truncated here.
  // In the case where the entire structure fits in registers, we set the
  // size in memory to zero.
  Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const TargetInstrInfo *TII) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR.isVirtual())
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // A byval argument is passed in as a pointer but it's now being
      // dereferenced. 
e.g.2987// define @foo(%struct.X* %A) {2988// tail call @bar(%struct.X* byval %A)2989// }2990return false;2991SDValue Ptr = Ld->getBasePtr();2992FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);2993if (!FINode)2994return false;2995FI = FINode->getIndex();2996} else2997return false;29982999assert(FI != std::numeric_limits<int>::max());3000if (!MFI.isFixedObjectIndex(FI))3001return false;3002return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);3003}30043005/// IsEligibleForTailCallOptimization - Check whether the call is eligible3006/// for tail call optimization. Targets which want to do tail call3007/// optimization should implement this function. Note that this function also3008/// processes musttail calls, so when this function returns false on a valid3009/// musttail call, a fatal backend error occurs.3010bool ARMTargetLowering::IsEligibleForTailCallOptimization(3011TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,3012SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {3013CallingConv::ID CalleeCC = CLI.CallConv;3014SDValue Callee = CLI.Callee;3015bool isVarArg = CLI.IsVarArg;3016const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;3017const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;3018const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;3019const SelectionDAG &DAG = CLI.DAG;3020MachineFunction &MF = DAG.getMachineFunction();3021const Function &CallerF = MF.getFunction();3022CallingConv::ID CallerCC = CallerF.getCallingConv();30233024assert(Subtarget->supportsTailCall());30253026// Indirect tail calls cannot be optimized for Thumb1 if the args3027// to the call take up r0-r3. The reason is that there are no legal registers3028// left to hold the pointer to the function to be called.3029// Similarly, if the function uses return address sign and authentication,3030// r12 is needed to hold the PAC and is not available to hold the callee3031// address.3032if (Outs.size() >= 4 &&3033(!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {3034if (Subtarget->isThumb1Only())3035return false;3036// Conservatively assume the function spills LR.3037if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))3038return false;3039}30403041// Look for obvious safe cases to perform tail call optimization that do not3042// require ABI changes. This is what gcc calls sibcall.30433044// Exception-handling functions need a special set of instructions to indicate3045// a return to the hardware. Tail-calling another function would probably3046// break this.3047if (CallerF.hasFnAttribute("interrupt"))3048return false;30493050if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))3051return CalleeCC == CallerCC;30523053// Also avoid sibcall optimization if either caller or callee uses struct3054// return semantics.3055bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();3056bool isCallerStructRet = MF.getFunction().hasStructRetAttr();3057if (isCalleeStructRet || isCallerStructRet)3058return false;30593060// Externally-defined functions with weak linkage should not be3061// tail-called on ARM when the OS does not support dynamic3062// pre-emption of symbols, as the AAELF spec requires normal calls3063// to undefined weak functions to be replaced with a NOP or jump to the3064// next instruction. 
The behaviour of branch instructions in this3065// situation (as used for tail calls) is implementation-defined, so we3066// cannot rely on the linker replacing the tail call with a return.3067if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {3068const GlobalValue *GV = G->getGlobal();3069const Triple &TT = getTargetMachine().getTargetTriple();3070if (GV->hasExternalWeakLinkage() &&3071(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))3072return false;3073}30743075// Check that the call results are passed in the same way.3076LLVMContext &C = *DAG.getContext();3077if (!CCState::resultsCompatible(3078getEffectiveCallingConv(CalleeCC, isVarArg),3079getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,3080CCAssignFnForReturn(CalleeCC, isVarArg),3081CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))3082return false;3083// The callee has to preserve all registers the caller needs to preserve.3084const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();3085const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);3086if (CalleeCC != CallerCC) {3087const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);3088if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))3089return false;3090}30913092// If Caller's vararg or byval argument has been split between registers and3093// stack, do not perform tail call, since part of the argument is in caller's3094// local frame.3095const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();3096if (AFI_Caller->getArgRegsSaveSize())3097return false;30983099// If the callee takes no arguments then go on to check the results of the3100// call.3101if (!Outs.empty()) {3102if (CCInfo.getStackSize()) {3103// Check if the arguments are already laid out in the right way as3104// the caller's fixed stack objects.3105MachineFrameInfo &MFI = MF.getFrameInfo();3106const MachineRegisterInfo *MRI = &MF.getRegInfo();3107const TargetInstrInfo *TII = Subtarget->getInstrInfo();3108for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();3109i != e;3110++i, ++realArgIdx) {3111CCValAssign &VA = ArgLocs[i];3112EVT RegVT = VA.getLocVT();3113SDValue Arg = OutVals[realArgIdx];3114ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;3115if (VA.getLocInfo() == CCValAssign::Indirect)3116return false;3117if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {3118// f64 and vector types are split into multiple registers or3119// register/stack-slot combinations. 
The types will not match3120// the registers; give up on memory f64 refs until we figure3121// out what to do about this.3122if (!VA.isRegLoc())3123return false;3124if (!ArgLocs[++i].isRegLoc())3125return false;3126if (RegVT == MVT::v2f64) {3127if (!ArgLocs[++i].isRegLoc())3128return false;3129if (!ArgLocs[++i].isRegLoc())3130return false;3131}3132} else if (!VA.isRegLoc()) {3133if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,3134MFI, MRI, TII))3135return false;3136}3137}3138}31393140const MachineRegisterInfo &MRI = MF.getRegInfo();3141if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))3142return false;3143}31443145return true;3146}31473148bool3149ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,3150MachineFunction &MF, bool isVarArg,3151const SmallVectorImpl<ISD::OutputArg> &Outs,3152LLVMContext &Context) const {3153SmallVector<CCValAssign, 16> RVLocs;3154CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);3155return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));3156}31573158static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,3159const SDLoc &DL, SelectionDAG &DAG) {3160const MachineFunction &MF = DAG.getMachineFunction();3161const Function &F = MF.getFunction();31623163StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();31643165// See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset3166// version of the "preferred return address". These offsets affect the return3167// instruction if this is a return from PL1 without hypervisor extensions.3168// IRQ/FIQ: +4 "subs pc, lr, #4"3169// SWI: 0 "subs pc, lr, #0"3170// ABORT: +4 "subs pc, lr, #4"3171// UNDEF: +4/+2 "subs pc, lr, #0"3172// UNDEF varies depending on where the exception came from ARM or Thumb3173// mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.31743175int64_t LROffset;3176if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||3177IntKind == "ABORT")3178LROffset = 4;3179else if (IntKind == "SWI" || IntKind == "UNDEF")3180LROffset = 0;3181else3182report_fatal_error("Unsupported interrupt attribute. 
If present, value "3183"must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");31843185RetOps.insert(RetOps.begin() + 1,3186DAG.getConstant(LROffset, DL, MVT::i32, false));31873188return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);3189}31903191SDValue3192ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,3193bool isVarArg,3194const SmallVectorImpl<ISD::OutputArg> &Outs,3195const SmallVectorImpl<SDValue> &OutVals,3196const SDLoc &dl, SelectionDAG &DAG) const {3197// CCValAssign - represent the assignment of the return value to a location.3198SmallVector<CCValAssign, 16> RVLocs;31993200// CCState - Info about the registers and stack slots.3201CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,3202*DAG.getContext());32033204// Analyze outgoing return values.3205CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));32063207SDValue Glue;3208SmallVector<SDValue, 4> RetOps;3209RetOps.push_back(Chain); // Operand #0 = Chain (updated below)3210bool isLittleEndian = Subtarget->isLittle();32113212MachineFunction &MF = DAG.getMachineFunction();3213ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();3214AFI->setReturnRegsCount(RVLocs.size());32153216// Report error if cmse entry function returns structure through first ptr arg.3217if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {3218// Note: using an empty SDLoc(), as the first line of the function is a3219// better place to report than the last line.3220DiagnosticInfoUnsupported Diag(3221DAG.getMachineFunction().getFunction(),3222"secure entry function would return value through pointer",3223SDLoc().getDebugLoc());3224DAG.getContext()->diagnose(Diag);3225}32263227// Copy the result values into the output registers.3228for (unsigned i = 0, realRVLocIdx = 0;3229i != RVLocs.size();3230++i, ++realRVLocIdx) {3231CCValAssign &VA = RVLocs[i];3232assert(VA.isRegLoc() && "Can only return in registers!");32333234SDValue Arg = OutVals[realRVLocIdx];3235bool ReturnF16 = false;32363237if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {3238// Half-precision return values can be returned like this:3239//3240// t11 f16 = fadd ...3241// t12: i16 = bitcast t113242// t13: i32 = zero_extend t123243// t14: f32 = bitcast t13 <~~~~~~~ Arg3244//3245// to avoid code generation for bitcasts, we simply set Arg to the node3246// that produces the f16 value, t11 in this case.3247//3248if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {3249SDValue ZE = Arg.getOperand(0);3250if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {3251SDValue BC = ZE.getOperand(0);3252if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {3253Arg = BC.getOperand(0);3254ReturnF16 = true;3255}3256}3257}3258}32593260switch (VA.getLocInfo()) {3261default: llvm_unreachable("Unknown loc info!");3262case CCValAssign::Full: break;3263case CCValAssign::BCvt:3264if (!ReturnF16)3265Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);3266break;3267}32683269// Mask f16 arguments if this is a CMSE nonsecure entry.3270auto RetVT = Outs[realRVLocIdx].ArgVT;3271if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {3272if (VA.needsCustom() && VA.getValVT() == MVT::f16) {3273Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);3274} else {3275auto LocBits = VA.getLocVT().getSizeInBits();3276auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());3277SDValue Mask =3278DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));3279Arg = 
DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);3280Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);3281Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);3282}3283}32843285if (VA.needsCustom() &&3286(VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {3287if (VA.getLocVT() == MVT::v2f64) {3288// Extract the first half and return it in two registers.3289SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,3290DAG.getConstant(0, dl, MVT::i32));3291SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,3292DAG.getVTList(MVT::i32, MVT::i32), Half);32933294Chain =3295DAG.getCopyToReg(Chain, dl, VA.getLocReg(),3296HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);3297Glue = Chain.getValue(1);3298RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));3299VA = RVLocs[++i]; // skip ahead to next loc3300Chain =3301DAG.getCopyToReg(Chain, dl, VA.getLocReg(),3302HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);3303Glue = Chain.getValue(1);3304RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));3305VA = RVLocs[++i]; // skip ahead to next loc33063307// Extract the 2nd half and fall through to handle it as an f64 value.3308Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,3309DAG.getConstant(1, dl, MVT::i32));3310}3311// Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is3312// available.3313SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,3314DAG.getVTList(MVT::i32, MVT::i32), Arg);3315Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),3316fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);3317Glue = Chain.getValue(1);3318RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));3319VA = RVLocs[++i]; // skip ahead to next loc3320Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),3321fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);3322} else3323Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);33243325// Guarantee that all emitted copies are3326// stuck together, avoiding something bad.3327Glue = Chain.getValue(1);3328RetOps.push_back(DAG.getRegister(3329VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));3330}3331const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();3332const MCPhysReg *I =3333TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());3334if (I) {3335for (; *I; ++I) {3336if (ARM::GPRRegClass.contains(*I))3337RetOps.push_back(DAG.getRegister(*I, MVT::i32));3338else if (ARM::DPRRegClass.contains(*I))3339RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));3340else3341llvm_unreachable("Unexpected register class in CSRsViaCopy!");3342}3343}33443345// Update chain and glue.3346RetOps[0] = Chain;3347if (Glue.getNode())3348RetOps.push_back(Glue);33493350// CPUs which aren't M-class use a special sequence to return from3351// exceptions (roughly, any instruction setting pc and cpsr simultaneously,3352// though we use "subs pc, lr, #N").3353//3354// M-class CPUs actually use a normal return sequence with a special3355// (hardware-provided) value in LR, so the normal code path works.3356if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&3357!Subtarget->isMClass()) {3358if (Subtarget->isThumb1Only())3359report_fatal_error("interrupt attribute is not supported in Thumb1");3360return LowerInterruptReturn(RetOps, dl, DAG);3361}33623363ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? 
ARMISD::SERET_GLUE :3364ARMISD::RET_GLUE;3365return DAG.getNode(RetNode, dl, MVT::Other, RetOps);3366}33673368bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {3369if (N->getNumValues() != 1)3370return false;3371if (!N->hasNUsesOfValue(1, 0))3372return false;33733374SDValue TCChain = Chain;3375SDNode *Copy = *N->use_begin();3376if (Copy->getOpcode() == ISD::CopyToReg) {3377// If the copy has a glue operand, we conservatively assume it isn't safe to3378// perform a tail call.3379if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)3380return false;3381TCChain = Copy->getOperand(0);3382} else if (Copy->getOpcode() == ARMISD::VMOVRRD) {3383SDNode *VMov = Copy;3384// f64 returned in a pair of GPRs.3385SmallPtrSet<SDNode*, 2> Copies;3386for (SDNode *U : VMov->uses()) {3387if (U->getOpcode() != ISD::CopyToReg)3388return false;3389Copies.insert(U);3390}3391if (Copies.size() > 2)3392return false;33933394for (SDNode *U : VMov->uses()) {3395SDValue UseChain = U->getOperand(0);3396if (Copies.count(UseChain.getNode()))3397// Second CopyToReg3398Copy = U;3399else {3400// We are at the top of this chain.3401// If the copy has a glue operand, we conservatively assume it3402// isn't safe to perform a tail call.3403if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)3404return false;3405// First CopyToReg3406TCChain = UseChain;3407}3408}3409} else if (Copy->getOpcode() == ISD::BITCAST) {3410// f32 returned in a single GPR.3411if (!Copy->hasOneUse())3412return false;3413Copy = *Copy->use_begin();3414if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))3415return false;3416// If the copy has a glue operand, we conservatively assume it isn't safe to3417// perform a tail call.3418if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)3419return false;3420TCChain = Copy->getOperand(0);3421} else {3422return false;3423}34243425bool HasRet = false;3426for (const SDNode *U : Copy->uses()) {3427if (U->getOpcode() != ARMISD::RET_GLUE &&3428U->getOpcode() != ARMISD::INTRET_GLUE)3429return false;3430HasRet = true;3431}34323433if (!HasRet)3434return false;34353436Chain = TCChain;3437return true;3438}34393440bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {3441if (!Subtarget->supportsTailCall())3442return false;34433444if (!CI->isTailCall())3445return false;34463447return true;3448}34493450// Trying to write a 64 bit value so need to split into two 32 bit values first,3451// and pass the lower and high parts through.3452static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {3453SDLoc DL(Op);3454SDValue WriteValue = Op->getOperand(2);34553456// This function is only supposed to be called for i64 type argument.3457assert(WriteValue.getValueType() == MVT::i643458&& "LowerWRITE_REGISTER called for non-i64 type argument.");34593460SDValue Lo, Hi;3461std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);3462SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };3463return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);3464}34653466// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as3467// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is3468// one of the above mentioned nodes. It has to be wrapped because otherwise3469// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only3470// be used to form addressing mode. 
These wrapped nodes will be selected3471// into MOVi.3472SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,3473SelectionDAG &DAG) const {3474EVT PtrVT = Op.getValueType();3475// FIXME there is no actual debug info here3476SDLoc dl(Op);3477ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);3478SDValue Res;34793480// When generating execute-only code Constant Pools must be promoted to the3481// global data section. It's a bit ugly that we can't share them across basic3482// blocks, but this way we guarantee that execute-only behaves correct with3483// position-independent addressing modes.3484if (Subtarget->genExecuteOnly()) {3485auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();3486auto T = const_cast<Type*>(CP->getType());3487auto C = const_cast<Constant*>(CP->getConstVal());3488auto M = const_cast<Module*>(DAG.getMachineFunction().3489getFunction().getParent());3490auto GV = new GlobalVariable(3491*M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,3492Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +3493Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +3494Twine(AFI->createPICLabelUId())3495);3496SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),3497dl, PtrVT);3498return LowerGlobalAddress(GA, DAG);3499}35003501// The 16-bit ADR instruction can only encode offsets that are multiples of 4,3502// so we need to align to at least 4 bytes when we don't have 32-bit ADR.3503Align CPAlign = CP->getAlign();3504if (Subtarget->isThumb1Only())3505CPAlign = std::max(CPAlign, Align(4));3506if (CP->isMachineConstantPoolEntry())3507Res =3508DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);3509else3510Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);3511return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);3512}35133514unsigned ARMTargetLowering::getJumpTableEncoding() const {3515// If we don't have a 32-bit pc-relative branch instruction then the jump3516// table consists of block addresses. Usually this is inline, but for3517// execute-only it must be placed out-of-line.3518if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())3519return MachineJumpTableInfo::EK_BlockAddress;3520return MachineJumpTableInfo::EK_Inline;3521}35223523SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,3524SelectionDAG &DAG) const {3525MachineFunction &MF = DAG.getMachineFunction();3526ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();3527unsigned ARMPCLabelIndex = 0;3528SDLoc DL(Op);3529EVT PtrVT = getPointerTy(DAG.getDataLayout());3530const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();3531SDValue CPAddr;3532bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();3533if (!IsPositionIndependent) {3534CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));3535} else {3536unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8;3537ARMPCLabelIndex = AFI->createPICLabelUId();3538ARMConstantPoolValue *CPV =3539ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,3540ARMCP::CPBlockAddress, PCAdj);3541CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));3542}3543CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);3544SDValue Result = DAG.getLoad(3545PtrVT, DL, DAG.getEntryNode(), CPAddr,3546MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3547if (!IsPositionIndependent)3548return Result;3549SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);3550return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);3551}35523553/// Convert a TLS address reference into the correct sequence of loads3554/// and calls to compute the variable's address for Darwin, and return an3555/// SDValue containing the final node.35563557/// Darwin only has one TLS scheme which must be capable of dealing with the3558/// fully general situation, in the worst case. This means:3559/// + "extern __thread" declaration.3560/// + Defined in a possibly unknown dynamic library.3561///3562/// The general system is that each __thread variable has a [3 x i32] descriptor3563/// which contains information used by the runtime to calculate the address. The3564/// only part of this the compiler needs to know about is the first word, which3565/// contains a function pointer that must be called with the address of the3566/// entire descriptor in "r0".3567///3568/// Since this descriptor may be in a different unit, in general access must3569/// proceed along the usual ARM rules. A common sequence to produce is:3570///3571/// movw rT1, :lower16:_var$non_lazy_ptr3572/// movt rT1, :upper16:_var$non_lazy_ptr3573/// ldr r0, [rT1]3574/// ldr rT2, [r0]3575/// blx rT23576/// [...address now in r0...]3577SDValue3578ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,3579SelectionDAG &DAG) const {3580assert(Subtarget->isTargetDarwin() &&3581"This function expects a Darwin target");3582SDLoc DL(Op);35833584// First step is to get the address of the actua global symbol. This is where3585// the TLS descriptor lives.3586SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);35873588// The first entry in the descriptor is a function pointer that we must call3589// to obtain the address of the variable.3590SDValue Chain = DAG.getEntryNode();3591SDValue FuncTLVGet = DAG.getLoad(3592MVT::i32, DL, Chain, DescAddr,3593MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),3594MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |3595MachineMemOperand::MOInvariant);3596Chain = FuncTLVGet.getValue(1);35973598MachineFunction &F = DAG.getMachineFunction();3599MachineFrameInfo &MFI = F.getFrameInfo();3600MFI.setAdjustsStack(true);36013602// TLS calls preserve all registers except those that absolutely must be3603// trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be3604// silly).3605auto TRI =3606getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();3607auto ARI = static_cast<const ARMRegisterInfo *>(TRI);3608const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());36093610// Finally, we can make the call. 
This is just a degenerate version of a3611// normal AArch64 call node: r0 takes the address of the descriptor, and3612// returns the address of the variable in this thread.3613Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());3614Chain =3615DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),3616Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),3617DAG.getRegisterMask(Mask), Chain.getValue(1));3618return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));3619}36203621SDValue3622ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,3623SelectionDAG &DAG) const {3624assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");36253626SDValue Chain = DAG.getEntryNode();3627EVT PtrVT = getPointerTy(DAG.getDataLayout());3628SDLoc DL(Op);36293630// Load the current TEB (thread environment block)3631SDValue Ops[] = {Chain,3632DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),3633DAG.getTargetConstant(15, DL, MVT::i32),3634DAG.getTargetConstant(0, DL, MVT::i32),3635DAG.getTargetConstant(13, DL, MVT::i32),3636DAG.getTargetConstant(0, DL, MVT::i32),3637DAG.getTargetConstant(2, DL, MVT::i32)};3638SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,3639DAG.getVTList(MVT::i32, MVT::Other), Ops);36403641SDValue TEB = CurrentTEB.getValue(0);3642Chain = CurrentTEB.getValue(1);36433644// Load the ThreadLocalStoragePointer from the TEB3645// A pointer to the TLS array is located at offset 0x2c from the TEB.3646SDValue TLSArray =3647DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));3648TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());36493650// The pointer to the thread's TLS data area is at the TLS Index scaled by 43651// offset into the TLSArray.36523653// Load the TLS index from the C runtime3654SDValue TLSIndex =3655DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);3656TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);3657TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());36583659SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,3660DAG.getConstant(2, DL, MVT::i32));3661SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,3662DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),3663MachinePointerInfo());36643665// Get the offset of the start of the .tls section (section base)3666const auto *GA = cast<GlobalAddressSDNode>(Op);3667auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);3668SDValue Offset = DAG.getLoad(3669PtrVT, DL, Chain,3670DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,3671DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),3672MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));36733674return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);3675}36763677// Lower ISD::GlobalTLSAddress using the "general dynamic" model3678SDValue3679ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,3680SelectionDAG &DAG) const {3681SDLoc dl(GA);3682EVT PtrVT = getPointerTy(DAG.getDataLayout());3683unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8;3684MachineFunction &MF = DAG.getMachineFunction();3685ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();3686unsigned ARMPCLabelIndex = AFI->createPICLabelUId();3687ARMConstantPoolValue *CPV =3688ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,3689ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);3690SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));3691Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);3692Argument = DAG.getLoad(3693PtrVT, dl, DAG.getEntryNode(), Argument,3694MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3695SDValue Chain = Argument.getValue(1);36963697SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);3698Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);36993700// call __tls_get_addr.3701ArgListTy Args;3702ArgListEntry Entry;3703Entry.Node = Argument;3704Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());3705Args.push_back(Entry);37063707// FIXME: is there useful debug info available here?3708TargetLowering::CallLoweringInfo CLI(DAG);3709CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(3710CallingConv::C, Type::getInt32Ty(*DAG.getContext()),3711DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));37123713std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);3714return CallResult.first;3715}37163717// Lower ISD::GlobalTLSAddress using the "initial exec" or3718// "local exec" model.3719SDValue3720ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,3721SelectionDAG &DAG,3722TLSModel::Model model) const {3723const GlobalValue *GV = GA->getGlobal();3724SDLoc dl(GA);3725SDValue Offset;3726SDValue Chain = DAG.getEntryNode();3727EVT PtrVT = getPointerTy(DAG.getDataLayout());3728// Get the Thread Pointer3729SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);37303731if (model == TLSModel::InitialExec) {3732MachineFunction &MF = DAG.getMachineFunction();3733ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();3734unsigned ARMPCLabelIndex = AFI->createPICLabelUId();3735// Initial exec model.3736unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8;3737ARMConstantPoolValue *CPV =3738ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,3739ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,3740true);3741Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));3742Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);3743Offset = DAG.getLoad(3744PtrVT, dl, Chain, Offset,3745MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3746Chain = Offset.getValue(1);37473748SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);3749Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);37503751Offset = DAG.getLoad(3752PtrVT, dl, Chain, Offset,3753MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3754} else {3755// local exec model3756assert(model == TLSModel::LocalExec);3757ARMConstantPoolValue *CPV =3758ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);3759Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));3760Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);3761Offset = DAG.getLoad(3762PtrVT, dl, Chain, Offset,3763MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3764}37653766// The address of the thread local variable is the add of the thread3767// pointer with the offset of the variable.3768return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);3769}37703771SDValue3772ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {3773GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);3774if (DAG.getTarget().useEmulatedTLS())3775return LowerToTLSEmulatedModel(GA, DAG);37763777if (Subtarget->isTargetDarwin())3778return LowerGlobalTLSAddressDarwin(Op, DAG);37793780if (Subtarget->isTargetWindows())3781return LowerGlobalTLSAddressWindows(Op, DAG);37823783// TODO: implement the "local dynamic" model3784assert(Subtarget->isTargetELF() && "Only ELF implemented here");3785TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());37863787switch (model) {3788case TLSModel::GeneralDynamic:3789case TLSModel::LocalDynamic:3790return LowerToTLSGeneralDynamicModel(GA, DAG);3791case TLSModel::InitialExec:3792case TLSModel::LocalExec:3793return LowerToTLSExecModels(GA, DAG, model);3794}3795llvm_unreachable("bogus TLS model");3796}37973798/// Return true if all users of V are within function F, looking through3799/// ConstantExprs.3800static bool allUsersAreInFunction(const Value *V, const Function *F) {3801SmallVector<const User*,4> Worklist(V->users());3802while (!Worklist.empty()) {3803auto *U = Worklist.pop_back_val();3804if (isa<ConstantExpr>(U)) {3805append_range(Worklist, U->users());3806continue;3807}38083809auto *I = dyn_cast<Instruction>(U);3810if (!I || I->getParent()->getParent() != F)3811return false;3812}3813return true;3814}38153816static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,3817const GlobalValue *GV, SelectionDAG &DAG,3818EVT PtrVT, const SDLoc &dl) {3819// If we're creating a pool entry for a constant global with unnamed address,3820// and the global is small enough, we can emit it inline into the constant pool3821// to save ourselves an indirection.3822//3823// This is a win if the constant is only used in one function (so it doesn't3824// need to be duplicated) or duplicating the constant wouldn't increase code3825// size (implying the constant is no larger than 4 bytes).3826const Function &F = DAG.getMachineFunction().getFunction();38273828// We rely on this decision to inline being idemopotent and unrelated to the3829// use-site. 
We know that if we inline a variable at one use site, we'll3830// inline it elsewhere too (and reuse the constant pool entry). Fast-isel3831// doesn't know about this optimization, so bail out if it's enabled else3832// we could decide to inline here (and thus never emit the GV) but require3833// the GV from fast-isel generated code.3834if (!EnableConstpoolPromotion ||3835DAG.getMachineFunction().getTarget().Options.EnableFastISel)3836return SDValue();38373838auto *GVar = dyn_cast<GlobalVariable>(GV);3839if (!GVar || !GVar->hasInitializer() ||3840!GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||3841!GVar->hasLocalLinkage())3842return SDValue();38433844// If we inline a value that contains relocations, we move the relocations3845// from .data to .text. This is not allowed in position-independent code.3846auto *Init = GVar->getInitializer();3847if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&3848Init->needsDynamicRelocation())3849return SDValue();38503851// The constant islands pass can only really deal with alignment requests3852// <= 4 bytes and cannot pad constants itself. Therefore we cannot promote3853// any type wanting greater alignment requirements than 4 bytes. We also3854// can only promote constants that are multiples of 4 bytes in size or3855// are paddable to a multiple of 4. Currently we only try and pad constants3856// that are strings for simplicity.3857auto *CDAInit = dyn_cast<ConstantDataArray>(Init);3858unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());3859Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);3860unsigned RequiredPadding = 4 - (Size % 4);3861bool PaddingPossible =3862RequiredPadding == 4 || (CDAInit && CDAInit->isString());3863if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||3864Size == 0)3865return SDValue();38663867unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);3868MachineFunction &MF = DAG.getMachineFunction();3869ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();38703871// We can't bloat the constant pool too much, else the ConstantIslands pass3872// may fail to converge. If we haven't promoted this global yet (it may have3873// multiple uses), and promoting it would increase the constant pool size (Sz3874// > 4), ensure we have space to do so up to MaxTotal.3875if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)3876if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=3877ConstpoolPromotionMaxTotal)3878return SDValue();38793880// This is only valid if all users are in a single function; we can't clone3881// the constant in general. The LLVM IR unnamed_addr allows merging3882// constants, but not cloning them.3883//3884// We could potentially allow cloning if we could prove all uses of the3885// constant in the current function don't care about the address, like3886// printf format strings. But that isn't implemented for now.3887if (!allUsersAreInFunction(GVar, &F))3888return SDValue();38893890// We're going to inline this global. 
Pad it out if needed.3891if (RequiredPadding != 4) {3892StringRef S = CDAInit->getAsString();38933894SmallVector<uint8_t,16> V(S.size());3895std::copy(S.bytes_begin(), S.bytes_end(), V.begin());3896while (RequiredPadding--)3897V.push_back(0);3898Init = ConstantDataArray::get(*DAG.getContext(), V);3899}39003901auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);3902SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));3903if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {3904AFI->markGlobalAsPromotedToConstantPool(GVar);3905AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +3906PaddedSize - 4);3907}3908++NumConstpoolPromoted;3909return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);3910}39113912bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {3913if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))3914if (!(GV = GA->getAliaseeObject()))3915return false;3916if (const auto *V = dyn_cast<GlobalVariable>(GV))3917return V->isConstant();3918return isa<Function>(GV);3919}39203921SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,3922SelectionDAG &DAG) const {3923switch (Subtarget->getTargetTriple().getObjectFormat()) {3924default: llvm_unreachable("unknown object format");3925case Triple::COFF:3926return LowerGlobalAddressWindows(Op, DAG);3927case Triple::ELF:3928return LowerGlobalAddressELF(Op, DAG);3929case Triple::MachO:3930return LowerGlobalAddressDarwin(Op, DAG);3931}3932}39333934SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,3935SelectionDAG &DAG) const {3936EVT PtrVT = getPointerTy(DAG.getDataLayout());3937SDLoc dl(Op);3938const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();3939bool IsRO = isReadOnly(GV);39403941// promoteToConstantPool only if not generating XO text section3942if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())3943if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))3944return V;39453946if (isPositionIndependent()) {3947SDValue G = DAG.getTargetGlobalAddress(3948GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);3949SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);3950if (!GV->isDSOLocal())3951Result =3952DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,3953MachinePointerInfo::getGOT(DAG.getMachineFunction()));3954return Result;3955} else if (Subtarget->isROPI() && IsRO) {3956// PC-relative.3957SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);3958SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);3959return Result;3960} else if (Subtarget->isRWPI() && !IsRO) {3961// SB-relative.3962SDValue RelAddr;3963if (Subtarget->useMovt()) {3964++NumMovwMovt;3965SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);3966RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);3967} else { // use literal pool for address constant3968ARMConstantPoolValue *CPV =3969ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);3970SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));3971CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);3972RelAddr = DAG.getLoad(3973PtrVT, dl, DAG.getEntryNode(), CPAddr,3974MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3975}3976SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);3977SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);3978return Result;3979}39803981// If we have T2 ops, we can materialize the address directly via movt/movw3982// pair. This is always cheaper. 
If need to generate Execute Only code, and we3983// only have Thumb1 available, we can't use a constant pool and are forced to3984// use immediate relocations.3985if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {3986if (Subtarget->useMovt())3987++NumMovwMovt;3988// FIXME: Once remat is capable of dealing with instructions with register3989// operands, expand this into two nodes.3990return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,3991DAG.getTargetGlobalAddress(GV, dl, PtrVT));3992} else {3993SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));3994CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);3995return DAG.getLoad(3996PtrVT, dl, DAG.getEntryNode(), CPAddr,3997MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));3998}3999}40004001SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,4002SelectionDAG &DAG) const {4003assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&4004"ROPI/RWPI not currently supported for Darwin");4005EVT PtrVT = getPointerTy(DAG.getDataLayout());4006SDLoc dl(Op);4007const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();40084009if (Subtarget->useMovt())4010++NumMovwMovt;40114012// FIXME: Once remat is capable of dealing with instructions with register4013// operands, expand this into multiple nodes4014unsigned Wrapper =4015isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;40164017SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);4018SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);40194020if (Subtarget->isGVIndirectSymbol(GV))4021Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,4022MachinePointerInfo::getGOT(DAG.getMachineFunction()));4023return Result;4024}40254026SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,4027SelectionDAG &DAG) const {4028assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");4029assert(Subtarget->useMovt() &&4030"Windows on ARM expects to use movw/movt");4031assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&4032"ROPI/RWPI not currently supported for Windows");40334034const TargetMachine &TM = getTargetMachine();4035const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();4036ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;4037if (GV->hasDLLImportStorageClass())4038TargetFlags = ARMII::MO_DLLIMPORT;4039else if (!TM.shouldAssumeDSOLocal(GV))4040TargetFlags = ARMII::MO_COFFSTUB;4041EVT PtrVT = getPointerTy(DAG.getDataLayout());4042SDValue Result;4043SDLoc DL(Op);40444045++NumMovwMovt;40464047// FIXME: Once remat is capable of dealing with instructions with register4048// operands, expand this into two nodes.4049Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,4050DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,4051TargetFlags));4052if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))4053Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,4054MachinePointerInfo::getGOT(DAG.getMachineFunction()));4055return Result;4056}40574058SDValue4059ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {4060SDLoc dl(Op);4061SDValue Val = DAG.getConstant(0, dl, MVT::i32);4062return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,4063DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),4064Op.getOperand(1), Val);4065}40664067SDValue4068ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {4069SDLoc dl(Op);4070return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),4071Op.getOperand(1), DAG.getConstant(0, dl, 
MVT::i32));4072}40734074SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,4075SelectionDAG &DAG) const {4076SDLoc dl(Op);4077return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,4078Op.getOperand(0));4079}40804081SDValue ARMTargetLowering::LowerINTRINSIC_VOID(4082SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {4083unsigned IntNo =4084Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);4085switch (IntNo) {4086default:4087return SDValue(); // Don't custom lower most intrinsics.4088case Intrinsic::arm_gnu_eabi_mcount: {4089MachineFunction &MF = DAG.getMachineFunction();4090EVT PtrVT = getPointerTy(DAG.getDataLayout());4091SDLoc dl(Op);4092SDValue Chain = Op.getOperand(0);4093// call "\01__gnu_mcount_nc"4094const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();4095const uint32_t *Mask =4096ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);4097assert(Mask && "Missing call preserved mask for calling convention");4098// Mark LR an implicit live-in.4099Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));4100SDValue ReturnAddress =4101DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);4102constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};4103SDValue Callee =4104DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);4105SDValue RegisterMask = DAG.getRegisterMask(Mask);4106if (Subtarget->isThumb())4107return SDValue(4108DAG.getMachineNode(4109ARM::tBL_PUSHLR, dl, ResultTys,4110{ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),4111DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),41120);4113return SDValue(4114DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,4115{ReturnAddress, Callee, RegisterMask, Chain}),41160);4117}4118}4119}41204121SDValue4122ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,4123const ARMSubtarget *Subtarget) const {4124unsigned IntNo = Op.getConstantOperandVal(0);4125SDLoc dl(Op);4126switch (IntNo) {4127default: return SDValue(); // Don't custom lower most intrinsics.4128case Intrinsic::thread_pointer: {4129EVT PtrVT = getPointerTy(DAG.getDataLayout());4130return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);4131}4132case Intrinsic::arm_cls: {4133const SDValue &Operand = Op.getOperand(1);4134const EVT VTy = Op.getValueType();4135SDValue SRA =4136DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));4137SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);4138SDValue SHL =4139DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));4140SDValue OR =4141DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));4142SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);4143return Result;4144}4145case Intrinsic::arm_cls64: {4146// cls(x) = if cls(hi(x)) != 31 then cls(hi(x))4147// else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))4148const SDValue &Operand = Op.getOperand(1);4149const EVT VTy = Op.getValueType();4150SDValue Lo, Hi;4151std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);4152SDValue Constant0 = DAG.getConstant(0, dl, VTy);4153SDValue Constant1 = DAG.getConstant(1, dl, VTy);4154SDValue Constant31 = DAG.getConstant(31, dl, VTy);4155SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);4156SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);4157SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);4158SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);4159SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);4160SDValue CheckLo 
=4161DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);4162SDValue HiIsZero =4163DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);4164SDValue AdjustedLo =4165DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));4166SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);4167SDValue Result =4168DAG.getSelect(dl, VTy, CheckLo,4169DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);4170return Result;4171}4172case Intrinsic::eh_sjlj_lsda: {4173MachineFunction &MF = DAG.getMachineFunction();4174ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();4175unsigned ARMPCLabelIndex = AFI->createPICLabelUId();4176EVT PtrVT = getPointerTy(DAG.getDataLayout());4177SDValue CPAddr;4178bool IsPositionIndependent = isPositionIndependent();4179unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;4180ARMConstantPoolValue *CPV =4181ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,4182ARMCP::CPLSDA, PCAdj);4183CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));4184CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);4185SDValue Result = DAG.getLoad(4186PtrVT, dl, DAG.getEntryNode(), CPAddr,4187MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));41884189if (IsPositionIndependent) {4190SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);4191Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);4192}4193return Result;4194}4195case Intrinsic::arm_neon_vabs:4196return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),4197Op.getOperand(1));4198case Intrinsic::arm_neon_vabds:4199if (Op.getValueType().isInteger())4200return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),4201Op.getOperand(1), Op.getOperand(2));4202return SDValue();4203case Intrinsic::arm_neon_vabdu:4204return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),4205Op.getOperand(1), Op.getOperand(2));4206case Intrinsic::arm_neon_vmulls:4207case Intrinsic::arm_neon_vmullu: {4208unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)4209? ARMISD::VMULLs : ARMISD::VMULLu;4210return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),4211Op.getOperand(1), Op.getOperand(2));4212}4213case Intrinsic::arm_neon_vminnm:4214case Intrinsic::arm_neon_vmaxnm: {4215unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)4216? ISD::FMINNUM : ISD::FMAXNUM;4217return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),4218Op.getOperand(1), Op.getOperand(2));4219}4220case Intrinsic::arm_neon_vminu:4221case Intrinsic::arm_neon_vmaxu: {4222if (Op.getValueType().isFloatingPoint())4223return SDValue();4224unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)4225? ISD::UMIN : ISD::UMAX;4226return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),4227Op.getOperand(1), Op.getOperand(2));4228}4229case Intrinsic::arm_neon_vmins:4230case Intrinsic::arm_neon_vmaxs: {4231// v{min,max}s is overloaded between signed integers and floats.4232if (!Op.getValueType().isFloatingPoint()) {4233unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)4234? ISD::SMIN : ISD::SMAX;4235return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),4236Op.getOperand(1), Op.getOperand(2));4237}4238unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)4239? 
ISD::FMINIMUM : ISD::FMAXIMUM;4240return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),4241Op.getOperand(1), Op.getOperand(2));4242}4243case Intrinsic::arm_neon_vtbl1:4244return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),4245Op.getOperand(1), Op.getOperand(2));4246case Intrinsic::arm_neon_vtbl2:4247return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),4248Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));4249case Intrinsic::arm_mve_pred_i2v:4250case Intrinsic::arm_mve_pred_v2i:4251return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),4252Op.getOperand(1));4253case Intrinsic::arm_mve_vreinterpretq:4254return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),4255Op.getOperand(1));4256case Intrinsic::arm_mve_lsll:4257return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),4258Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));4259case Intrinsic::arm_mve_asrl:4260return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),4261Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));4262}4263}42644265static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,4266const ARMSubtarget *Subtarget) {4267SDLoc dl(Op);4268auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));4269if (SSID == SyncScope::SingleThread)4270return Op;42714272if (!Subtarget->hasDataBarrier()) {4273// Some ARMv6 cpus can support data barriers with an mcr instruction.4274// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get4275// here.4276assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&4277"Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");4278return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),4279DAG.getConstant(0, dl, MVT::i32));4280}42814282AtomicOrdering Ord =4283static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));4284ARM_MB::MemBOpt Domain = ARM_MB::ISH;4285if (Subtarget->isMClass()) {4286// Only a full system barrier exists in the M-class architectures.4287Domain = ARM_MB::SY;4288} else if (Subtarget->preferISHSTBarriers() &&4289Ord == AtomicOrdering::Release) {4290// Swift happens to implement ISHST barriers in a way that's compatible with4291// Release semantics but weaker than ISH so we'd be fools not to use4292// it. 
Beware: other processors probably don't!4293Domain = ARM_MB::ISHST;4294}42954296return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),4297DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),4298DAG.getConstant(Domain, dl, MVT::i32));4299}43004301static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,4302const ARMSubtarget *Subtarget) {4303// ARM pre v5TE and Thumb1 does not have preload instructions.4304if (!(Subtarget->isThumb2() ||4305(!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))4306// Just preserve the chain.4307return Op.getOperand(0);43084309SDLoc dl(Op);4310unsigned isRead = ~Op.getConstantOperandVal(2) & 1;4311if (!isRead &&4312(!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))4313// ARMv7 with MP extension has PLDW.4314return Op.getOperand(0);43154316unsigned isData = Op.getConstantOperandVal(4);4317if (Subtarget->isThumb()) {4318// Invert the bits.4319isRead = ~isRead & 1;4320isData = ~isData & 1;4321}43224323return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),4324Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),4325DAG.getConstant(isData, dl, MVT::i32));4326}43274328static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {4329MachineFunction &MF = DAG.getMachineFunction();4330ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();43314332// vastart just stores the address of the VarArgsFrameIndex slot into the4333// memory location argument.4334SDLoc dl(Op);4335EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());4336SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);4337const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();4338return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),4339MachinePointerInfo(SV));4340}43414342SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,4343CCValAssign &NextVA,4344SDValue &Root,4345SelectionDAG &DAG,4346const SDLoc &dl) const {4347MachineFunction &MF = DAG.getMachineFunction();4348ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();43494350const TargetRegisterClass *RC;4351if (AFI->isThumb1OnlyFunction())4352RC = &ARM::tGPRRegClass;4353else4354RC = &ARM::GPRRegClass;43554356// Transform the arguments stored in physical registers into virtual ones.4357Register Reg = MF.addLiveIn(VA.getLocReg(), RC);4358SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);43594360SDValue ArgValue2;4361if (NextVA.isMemLoc()) {4362MachineFrameInfo &MFI = MF.getFrameInfo();4363int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);43644365// Create load node to retrieve arguments from the stack.4366SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));4367ArgValue2 = DAG.getLoad(4368MVT::i32, dl, Root, FIN,4369MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));4370} else {4371Reg = MF.addLiveIn(NextVA.getLocReg(), RC);4372ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);4373}4374if (!Subtarget->isLittle())4375std::swap (ArgValue, ArgValue2);4376return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);4377}43784379// The remaining GPRs hold either the beginning of variable-argument4380// data, or the beginning of an aggregate passed by value (usually4381// byval). 
// Either way, we allocate stack slots adjacent to the data
// provided by our caller, and store the unallocated registers there.
// If this is a variadic function, the va_list pointer will begin with
// these values; otherwise, this reassembles a (byval) structure that
// was split between registers and memory.
// Return: the frame index that the registers were stored into.
int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                      const SDLoc &dl, SDValue &Chain,
                                      const Value *OrigArg,
                                      unsigned InRegsParamRecordIdx,
                                      int ArgOffset, unsigned ArgSize) const {
  // Currently, two use-cases are possible:
  // Case #1. Non-var-args function, and we meet the first byval parameter.
  //          Set up the first unallocated register as the first byval
  //          register; eat all remaining registers
  //          (these two actions are performed by the HandleByVal method).
  //          Then, here, we initialize the stack frame with
  //          "store-reg" instructions.
  // Case #2. Var-args function that doesn't contain byval parameters.
  //          The same: eat all remaining unallocated registers,
  //          initialize the stack frame.

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned RBegin, REnd;
  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
  } else {
    unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
    RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
    REnd = ARM::R4;
  }

  if (REnd != RBegin)
    ArgOffset = -4 * (ARM::R4 - RBegin);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
  SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

  SmallVector<SDValue, 4> MemOps;
  const TargetRegisterClass *RC =
      AFI->isThumb1OnlyFunction() ?
&ARM::tGPRRegClass : &ARM::GPRRegClass;44254426for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {4427Register VReg = MF.addLiveIn(Reg, RC);4428SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);4429SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,4430MachinePointerInfo(OrigArg, 4 * i));4431MemOps.push_back(Store);4432FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));4433}44344435if (!MemOps.empty())4436Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);4437return FrameIndex;4438}44394440// Setup stack frame, the va_list pointer will start from.4441void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,4442const SDLoc &dl, SDValue &Chain,4443unsigned ArgOffset,4444unsigned TotalArgRegsSaveSize,4445bool ForceMutable) const {4446MachineFunction &MF = DAG.getMachineFunction();4447ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();44484449// Try to store any remaining integer argument regs4450// to their spots on the stack so that they may be loaded by dereferencing4451// the result of va_next.4452// If there is no regs to be stored, just point address after last4453// argument passed via stack.4454int FrameIndex = StoreByValRegs(4455CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),4456CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));4457AFI->setVarArgsFrameIndex(FrameIndex);4458}44594460bool ARMTargetLowering::splitValueIntoRegisterParts(4461SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,4462unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {4463EVT ValueVT = Val.getValueType();4464if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {4465unsigned ValueBits = ValueVT.getSizeInBits();4466unsigned PartBits = PartVT.getSizeInBits();4467Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);4468Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);4469Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);4470Parts[0] = Val;4471return true;4472}4473return false;4474}44754476SDValue ARMTargetLowering::joinRegisterPartsIntoValue(4477SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,4478MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {4479if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {4480unsigned ValueBits = ValueVT.getSizeInBits();4481unsigned PartBits = PartVT.getSizeInBits();4482SDValue Val = Parts[0];44834484Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);4485Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);4486Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);4487return Val;4488}4489return SDValue();4490}44914492SDValue ARMTargetLowering::LowerFormalArguments(4493SDValue Chain, CallingConv::ID CallConv, bool isVarArg,4494const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,4495SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {4496MachineFunction &MF = DAG.getMachineFunction();4497MachineFrameInfo &MFI = MF.getFrameInfo();44984499ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();45004501// Assign locations to all of the incoming arguments.4502SmallVector<CCValAssign, 16> ArgLocs;4503CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,4504*DAG.getContext());4505CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));45064507Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();4508unsigned CurArgIdx = 
0;45094510// Initially ArgRegsSaveSize is zero.4511// Then we increase this value each time we meet byval parameter.4512// We also increase this value in case of varargs function.4513AFI->setArgRegsSaveSize(0);45144515// Calculate the amount of stack space that we need to allocate to store4516// byval and variadic arguments that are passed in registers.4517// We need to know this before we allocate the first byval or variadic4518// argument, as they will be allocated a stack slot below the CFA (Canonical4519// Frame Address, the stack pointer at entry to the function).4520unsigned ArgRegBegin = ARM::R4;4521for (const CCValAssign &VA : ArgLocs) {4522if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())4523break;45244525unsigned Index = VA.getValNo();4526ISD::ArgFlagsTy Flags = Ins[Index].Flags;4527if (!Flags.isByVal())4528continue;45294530assert(VA.isMemLoc() && "unexpected byval pointer in reg");4531unsigned RBegin, REnd;4532CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);4533ArgRegBegin = std::min(ArgRegBegin, RBegin);45344535CCInfo.nextInRegsParam();4536}4537CCInfo.rewindByValRegsInfo();45384539int lastInsIndex = -1;4540if (isVarArg && MFI.hasVAStart()) {4541unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);4542if (RegIdx != std::size(GPRArgRegs))4543ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);4544}45454546unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);4547AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);4548auto PtrVT = getPointerTy(DAG.getDataLayout());45494550for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {4551CCValAssign &VA = ArgLocs[i];4552if (Ins[VA.getValNo()].isOrigArg()) {4553std::advance(CurOrigArg,4554Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);4555CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();4556}4557// Arguments stored in registers.4558if (VA.isRegLoc()) {4559EVT RegVT = VA.getLocVT();4560SDValue ArgValue;45614562if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {4563// f64 and vector types are split up into multiple registers or4564// combinations of registers and stack slots.4565SDValue ArgValue1 =4566GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);4567VA = ArgLocs[++i]; // skip ahead to next loc4568SDValue ArgValue2;4569if (VA.isMemLoc()) {4570int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);4571SDValue FIN = DAG.getFrameIndex(FI, PtrVT);4572ArgValue2 = DAG.getLoad(4573MVT::f64, dl, Chain, FIN,4574MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));4575} else {4576ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);4577}4578ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);4579ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,4580ArgValue1, DAG.getIntPtrConstant(0, dl));4581ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,4582ArgValue2, DAG.getIntPtrConstant(1, dl));4583} else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {4584ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);4585} else {4586const TargetRegisterClass *RC;45874588if (RegVT == MVT::f16 || RegVT == MVT::bf16)4589RC = &ARM::HPRRegClass;4590else if (RegVT == MVT::f32)4591RC = &ARM::SPRRegClass;4592else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||4593RegVT == MVT::v4bf16)4594RC = &ARM::DPRRegClass;4595else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||4596RegVT == MVT::v8bf16)4597RC = &ARM::QPRRegClass;4598else if (RegVT == MVT::i32)4599RC = AFI->isThumb1OnlyFunction() ? 
&ARM::tGPRRegClass4600: &ARM::GPRRegClass;4601else4602llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");46034604// Transform the arguments in physical registers into virtual ones.4605Register Reg = MF.addLiveIn(VA.getLocReg(), RC);4606ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);46074608// If this value is passed in r0 and has the returned attribute (e.g.4609// C++ 'structors), record this fact for later use.4610if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {4611AFI->setPreservesR0();4612}4613}46144615// If this is an 8 or 16-bit value, it is really passed promoted4616// to 32 bits. Insert an assert[sz]ext to capture this, then4617// truncate to the right size.4618switch (VA.getLocInfo()) {4619default: llvm_unreachable("Unknown loc info!");4620case CCValAssign::Full: break;4621case CCValAssign::BCvt:4622ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);4623break;4624}46254626// f16 arguments have their size extended to 4 bytes and passed as if they4627// had been copied to the LSBs of a 32-bit register.4628// For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)4629if (VA.needsCustom() &&4630(VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))4631ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);46324633// On CMSE Entry Functions, formal integer arguments whose bitwidth is4634// less than 32 bits must be sign- or zero-extended in the callee for4635// security reasons. Although the ABI mandates an extension done by the4636// caller, the latter cannot be trusted to follow the rules of the ABI.4637const ISD::InputArg &Arg = Ins[VA.getValNo()];4638if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&4639RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))4640ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);46414642InVals.push_back(ArgValue);4643} else { // VA.isRegLoc()4644// Only arguments passed on the stack should make it here.4645assert(VA.isMemLoc());4646assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");46474648int index = VA.getValNo();46494650// Some Ins[] entries become multiple ArgLoc[] entries.4651// Process them only once.4652if (index != lastInsIndex)4653{4654ISD::ArgFlagsTy Flags = Ins[index].Flags;4655// FIXME: For now, all byval parameter objects are marked mutable.4656// This can be changed with more analysis.4657// In case of tail call optimization mark all arguments mutable.4658// Since they could be overwritten by lowering of arguments in case of4659// a tail call.4660if (Flags.isByVal()) {4661assert(Ins[index].isOrigArg() &&4662"Byval arguments cannot be implicit");4663unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();46644665int FrameIndex = StoreByValRegs(4666CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,4667VA.getLocMemOffset(), Flags.getByValSize());4668InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));4669CCInfo.nextInRegsParam();4670} else {4671unsigned FIOffset = VA.getLocMemOffset();4672int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,4673FIOffset, true);46744675// Create load nodes to retrieve arguments from the stack.4676SDValue FIN = DAG.getFrameIndex(FI, PtrVT);4677InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,4678MachinePointerInfo::getFixedStack(4679DAG.getMachineFunction(), FI)));4680}4681lastInsIndex = index;4682}4683}4684}46854686// varargs4687if (isVarArg && MFI.hasVAStart()) {4688VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 
CCInfo.getStackSize(),4689TotalArgRegsSaveSize);4690if (AFI->isCmseNSEntryFunction()) {4691DiagnosticInfoUnsupported Diag(4692DAG.getMachineFunction().getFunction(),4693"secure entry function must not be variadic", dl.getDebugLoc());4694DAG.getContext()->diagnose(Diag);4695}4696}46974698unsigned StackArgSize = CCInfo.getStackSize();4699bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;4700if (canGuaranteeTCO(CallConv, TailCallOpt)) {4701// The only way to guarantee a tail call is if the callee restores its4702// argument area, but it must also keep the stack aligned when doing so.4703const DataLayout &DL = DAG.getDataLayout();4704StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());47054706AFI->setArgumentStackToRestore(StackArgSize);4707}4708AFI->setArgumentStackSize(StackArgSize);47094710if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {4711DiagnosticInfoUnsupported Diag(4712DAG.getMachineFunction().getFunction(),4713"secure entry function requires arguments on stack", dl.getDebugLoc());4714DAG.getContext()->diagnose(Diag);4715}47164717return Chain;4718}47194720/// isFloatingPointZero - Return true if this is +0.0.4721static bool isFloatingPointZero(SDValue Op) {4722if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))4723return CFP->getValueAPF().isPosZero();4724else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {4725// Maybe this has already been legalized into the constant pool?4726if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {4727SDValue WrapperOp = Op.getOperand(1).getOperand(0);4728if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))4729if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))4730return CFP->getValueAPF().isPosZero();4731}4732} else if (Op->getOpcode() == ISD::BITCAST &&4733Op->getValueType(0) == MVT::f64) {4734// Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)4735// created by LowerConstantFP().4736SDValue BitcastOp = Op->getOperand(0);4737if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&4738isNullConstant(BitcastOp->getOperand(0)))4739return true;4740}4741return false;4742}47434744/// Returns appropriate ARM CMP (cmp) and corresponding condition code for4745/// the given operands.4746SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,4747SDValue &ARMcc, SelectionDAG &DAG,4748const SDLoc &dl) const {4749if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {4750unsigned C = RHSC->getZExtValue();4751if (!isLegalICmpImmediate((int32_t)C)) {4752// Constant does not fit, try adjusting it by one.4753switch (CC) {4754default: break;4755case ISD::SETLT:4756case ISD::SETGE:4757if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {4758CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;4759RHS = DAG.getConstant(C - 1, dl, MVT::i32);4760}4761break;4762case ISD::SETULT:4763case ISD::SETUGE:4764if (C != 0 && isLegalICmpImmediate(C-1)) {4765CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;4766RHS = DAG.getConstant(C - 1, dl, MVT::i32);4767}4768break;4769case ISD::SETLE:4770case ISD::SETGT:4771if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {4772CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;4773RHS = DAG.getConstant(C + 1, dl, MVT::i32);4774}4775break;4776case ISD::SETULE:4777case ISD::SETUGT:4778if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {4779CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE;4780RHS = DAG.getConstant(C + 1, dl, MVT::i32);4781}4782break;4783}4784}4785} else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&4786(ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {4787// In ARM and Thumb-2, the compare instructions can shift their second4788// operand.4789CC = ISD::getSetCCSwappedOperands(CC);4790std::swap(LHS, RHS);4791}47924793// Thumb1 has very limited immediate modes, so turning an "and" into a4794// shift can save multiple instructions.4795//4796// If we have (x & C1), and C1 is an appropriate mask, we can transform it4797// into "((x << n) >> n)". But that isn't necessarily profitable on its4798// own. If it's the operand to an unsigned comparison with an immediate,4799// we can eliminate one of the shifts: we transform4800// "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".4801//4802// We avoid transforming cases which aren't profitable due to encoding4803// details:4804//4805// 1. C2 fits into the immediate field of a cmp, and the transformed version4806// would not; in that case, we're essentially trading one immediate load for4807// another.4808// 2. C1 is 255 or 65535, so we can use uxtb or uxth.4809// 3. C2 is zero; we have other code for this special case.4810//4811// FIXME: Figure out profitability for Thumb2; we usually can't save an4812// instruction, since the AND is always one instruction anyway, but we could4813// use narrow instructions in some cases.4814if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&4815LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&4816LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&4817!isSignedIntSetCC(CC)) {4818unsigned Mask = LHS.getConstantOperandVal(1);4819auto *RHSC = cast<ConstantSDNode>(RHS.getNode());4820uint64_t RHSV = RHSC->getZExtValue();4821if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {4822unsigned ShiftBits = llvm::countl_zero(Mask);4823if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {4824SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);4825LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);4826RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);4827}4828}4829}48304831// The specific comparison "(x<<c) > 0x80000000U" can be optimized to a4832// single "lsls x, c+1". The shift sets the "C" and "Z" flags the same4833// way a cmp would.4834// FIXME: Add support for ARM/Thumb2; this would need isel patterns, and4835// some tweaks to the heuristics for the previous and->shift transform.4836// FIXME: Optimize cases where the LHS isn't a shift.4837if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&4838isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&4839CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&4840LHS.getConstantOperandVal(1) < 31) {4841unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;4842SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,4843DAG.getVTList(MVT::i32, MVT::i32),4844LHS.getOperand(0),4845DAG.getConstant(ShiftAmt, dl, MVT::i32));4846SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,4847Shift.getValue(1), SDValue());4848ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);4849return Chain.getValue(1);4850}48514852ARMCC::CondCodes CondCode = IntCCToARMCC(CC);48534854// If the RHS is a constant zero then the V (overflow) flag will never be4855// set. 
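  // (Recall that GE tests N == V and LT tests N != V; once V is known to be
  // clear, these collapse to N == 0 (PL) and N == 1 (MI) respectively, which
  // is the simplification applied below.)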
  // This can allow us to simplify GE to PL or LT to MI, which can be
  // simpler for other passes (like the peephole optimiser) to deal with.
  if (isNullConstant(RHS)) {
    switch (CondCode) {
    default: break;
    case ARMCC::GE:
      CondCode = ARMCC::PL;
      break;
    case ARMCC::LT:
      CondCode = ARMCC::MI;
      break;
    }
  }

  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
    // Uses only Z Flag
    CompareType = ARMISD::CMPZ;
    break;
  }
  ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                     SelectionDAG &DAG, const SDLoc &dl,
                                     bool Signaling) const {
  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
  SDValue Cmp;
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
                      dl, MVT::Glue, LHS, RHS);
  else
    Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
                      dl, MVT::Glue, LHS);
  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}

/// duplicateCmp - Glue values can have only one use, so this function
/// duplicates a comparison node.
SDValue
ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
  unsigned Opc = Cmp.getOpcode();
  SDLoc DL(Cmp);
  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                       Cmp.getOperand(1));

  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
  Cmp = Cmp.getOperand(0);
  Opc = Cmp.getOpcode();
  if (Opc == ARMISD::CMPFP)
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
                      Cmp.getOperand(1));
  else {
    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
  }
  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}

// This function returns three things: the arithmetic computation itself
// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
// comparison and the condition code define the case in which the arithmetic
// computation *does not* overflow.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend.
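  // (For context: CMN Rn, #imm sets flags on Rn + imm, so a compare against a
  // negative constant such as -42 could be encoded as "cmn rN, #42" instead of
  // materialising the constant; the numbers here are purely illustrative.)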
This is not as good as the natural4936// CMP case because it causes a register dependency and cannot be folded4937// later.49384939switch (Op.getOpcode()) {4940default:4941llvm_unreachable("Unknown overflow instruction!");4942case ISD::SADDO:4943ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);4944Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);4945OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);4946break;4947case ISD::UADDO:4948ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);4949// We use ADDC here to correspond to its use in LowerUnsignedALUO.4950// We do not use it in the USUBO case as Value may not be used.4951Value = DAG.getNode(ARMISD::ADDC, dl,4952DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)4953.getValue(0);4954OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);4955break;4956case ISD::SSUBO:4957ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);4958Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);4959OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);4960break;4961case ISD::USUBO:4962ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);4963Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);4964OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);4965break;4966case ISD::UMULO:4967// We generate a UMUL_LOHI and then check if the high word is 0.4968ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);4969Value = DAG.getNode(ISD::UMUL_LOHI, dl,4970DAG.getVTList(Op.getValueType(), Op.getValueType()),4971LHS, RHS);4972OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),4973DAG.getConstant(0, dl, MVT::i32));4974Value = Value.getValue(0); // We only want the low 32 bits for the result.4975break;4976case ISD::SMULO:4977// We generate a SMUL_LOHI and then check if all the bits of the high word4978// are the same as the sign bit of the low word.4979ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);4980Value = DAG.getNode(ISD::SMUL_LOHI, dl,4981DAG.getVTList(Op.getValueType(), Op.getValueType()),4982LHS, RHS);4983OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),4984DAG.getNode(ISD::SRA, dl, Op.getValueType(),4985Value.getValue(0),4986DAG.getConstant(31, dl, MVT::i32)));4987Value = Value.getValue(0); // We only want the low 32 bits for the result.4988break;4989} // switch (...)49904991return std::make_pair(Value, OverflowCmp);4992}49934994SDValue4995ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {4996// Let legalize expand this if it isn't a legal type yet.4997if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))4998return SDValue();49995000SDValue Value, OverflowCmp;5001SDValue ARMcc;5002std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);5003SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);5004SDLoc dl(Op);5005// We use 0 and 1 as false and true values.5006SDValue TVal = DAG.getConstant(1, dl, MVT::i32);5007SDValue FVal = DAG.getConstant(0, dl, MVT::i32);5008EVT VT = Op.getValueType();50095010SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,5011ARMcc, CCR, OverflowCmp);50125013SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);5014return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);5015}50165017static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,5018SelectionDAG &DAG) {5019SDLoc DL(BoolCarry);5020EVT CarryVT = BoolCarry.getValueType();50215022// This converts the boolean value carry into the carry flag by doing5023// ARMISD::SUBC Carry, 15024SDValue Carry = 
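  // Worked example of the conversion built here: SUBC computes Carry - 1.
  // A boolean carry of 1 gives 1 - 1 = 0 with no borrow, so the ARM carry
  // flag ends up set; a boolean carry of 0 gives 0 - 1, which borrows, so
  // the carry flag ends up clear.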
DAG.getNode(ARMISD::SUBC, DL,5025DAG.getVTList(CarryVT, MVT::i32),5026BoolCarry, DAG.getConstant(1, DL, CarryVT));5027return Carry.getValue(1);5028}50295030static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,5031SelectionDAG &DAG) {5032SDLoc DL(Flags);50335034// Now convert the carry flag into a boolean carry. We do this5035// using ARMISD:ADDE 0, 0, Carry5036return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),5037DAG.getConstant(0, DL, MVT::i32),5038DAG.getConstant(0, DL, MVT::i32), Flags);5039}50405041SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,5042SelectionDAG &DAG) const {5043// Let legalize expand this if it isn't a legal type yet.5044if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))5045return SDValue();50465047SDValue LHS = Op.getOperand(0);5048SDValue RHS = Op.getOperand(1);5049SDLoc dl(Op);50505051EVT VT = Op.getValueType();5052SDVTList VTs = DAG.getVTList(VT, MVT::i32);5053SDValue Value;5054SDValue Overflow;5055switch (Op.getOpcode()) {5056default:5057llvm_unreachable("Unknown overflow instruction!");5058case ISD::UADDO:5059Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);5060// Convert the carry flag into a boolean value.5061Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);5062break;5063case ISD::USUBO: {5064Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);5065// Convert the carry flag into a boolean value.5066Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);5067// ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow5068// value. So compute 1 - C.5069Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,5070DAG.getConstant(1, dl, MVT::i32), Overflow);5071break;5072}5073}50745075return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);5076}50775078static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,5079const ARMSubtarget *Subtarget) {5080EVT VT = Op.getValueType();5081if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())5082return SDValue();5083if (!VT.isSimple())5084return SDValue();50855086unsigned NewOpcode;5087switch (VT.getSimpleVT().SimpleTy) {5088default:5089return SDValue();5090case MVT::i8:5091switch (Op->getOpcode()) {5092case ISD::UADDSAT:5093NewOpcode = ARMISD::UQADD8b;5094break;5095case ISD::SADDSAT:5096NewOpcode = ARMISD::QADD8b;5097break;5098case ISD::USUBSAT:5099NewOpcode = ARMISD::UQSUB8b;5100break;5101case ISD::SSUBSAT:5102NewOpcode = ARMISD::QSUB8b;5103break;5104}5105break;5106case MVT::i16:5107switch (Op->getOpcode()) {5108case ISD::UADDSAT:5109NewOpcode = ARMISD::UQADD16b;5110break;5111case ISD::SADDSAT:5112NewOpcode = ARMISD::QADD16b;5113break;5114case ISD::USUBSAT:5115NewOpcode = ARMISD::UQSUB16b;5116break;5117case ISD::SSUBSAT:5118NewOpcode = ARMISD::QSUB16b;5119break;5120}5121break;5122}51235124SDLoc dl(Op);5125SDValue Add =5126DAG.getNode(NewOpcode, dl, MVT::i32,5127DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),5128DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));5129return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);5130}51315132SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {5133SDValue Cond = Op.getOperand(0);5134SDValue SelectTrue = Op.getOperand(1);5135SDValue SelectFalse = Op.getOperand(2);5136SDLoc dl(Op);5137unsigned Opc = Cond.getOpcode();51385139if (Cond.getResNo() == 1 &&5140(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||5141Opc == ISD::USUBO)) {5142if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))5143return 
SDValue();51445145SDValue Value, OverflowCmp;5146SDValue ARMcc;5147std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);5148SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);5149EVT VT = Op.getValueType();51505151return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,5152OverflowCmp, DAG);5153}51545155// Convert:5156//5157// (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)5158// (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)5159//5160if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {5161const ConstantSDNode *CMOVTrue =5162dyn_cast<ConstantSDNode>(Cond.getOperand(0));5163const ConstantSDNode *CMOVFalse =5164dyn_cast<ConstantSDNode>(Cond.getOperand(1));51655166if (CMOVTrue && CMOVFalse) {5167unsigned CMOVTrueVal = CMOVTrue->getZExtValue();5168unsigned CMOVFalseVal = CMOVFalse->getZExtValue();51695170SDValue True;5171SDValue False;5172if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {5173True = SelectTrue;5174False = SelectFalse;5175} else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {5176True = SelectFalse;5177False = SelectTrue;5178}51795180if (True.getNode() && False.getNode()) {5181EVT VT = Op.getValueType();5182SDValue ARMcc = Cond.getOperand(2);5183SDValue CCR = Cond.getOperand(3);5184SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);5185assert(True.getValueType() == VT);5186return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);5187}5188}5189}51905191// ARM's BooleanContents value is UndefinedBooleanContent. Mask out the5192// undefined bits before doing a full-word comparison with zero.5193Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,5194DAG.getConstant(1, dl, Cond.getValueType()));51955196return DAG.getSelectCC(dl, Cond,5197DAG.getConstant(0, dl, Cond.getValueType()),5198SelectTrue, SelectFalse, ISD::SETNE);5199}52005201static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,5202bool &swpCmpOps, bool &swpVselOps) {5203// Start by selecting the GE condition code for opcodes that return true for5204// 'equality'5205if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||5206CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)5207CondCode = ARMCC::GE;52085209// and GT for opcodes that return false for 'equality'.5210else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||5211CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)5212CondCode = ARMCC::GT;52135214// Since we are constrained to GE/GT, if the opcode contains 'less', we need5215// to swap the compare operands.5216if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||5217CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)5218swpCmpOps = true;52195220// Both GT and GE are ordered comparisons, and return false for 'unordered'.5221// If we have an unordered opcode, we need to swap the operands to the VSEL5222// instruction (effectively negating the condition).5223//5224// This also has the effect of swapping which one of 'less' or 'greater'5225// returns true, so we also swap the compare operands. It also switches5226// whether we return true for 'equality', so we compensate by picking the5227// opposite condition code to our original choice.5228if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||5229CC == ISD::SETUGT) {5230swpCmpOps = !swpCmpOps;5231swpVselOps = !swpVselOps;5232CondCode = CondCode == ARMCC::GT ? 
ARMCC::GE : ARMCC::GT;5233}52345235// 'ordered' is 'anything but unordered', so use the VS condition code and5236// swap the VSEL operands.5237if (CC == ISD::SETO) {5238CondCode = ARMCC::VS;5239swpVselOps = true;5240}52415242// 'unordered or not equal' is 'anything but equal', so use the EQ condition5243// code and swap the VSEL operands. Also do this if we don't care about the5244// unordered case.5245if (CC == ISD::SETUNE || CC == ISD::SETNE) {5246CondCode = ARMCC::EQ;5247swpVselOps = true;5248}5249}52505251SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,5252SDValue TrueVal, SDValue ARMcc, SDValue CCR,5253SDValue Cmp, SelectionDAG &DAG) const {5254if (!Subtarget->hasFP64() && VT == MVT::f64) {5255FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,5256DAG.getVTList(MVT::i32, MVT::i32), FalseVal);5257TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,5258DAG.getVTList(MVT::i32, MVT::i32), TrueVal);52595260SDValue TrueLow = TrueVal.getValue(0);5261SDValue TrueHigh = TrueVal.getValue(1);5262SDValue FalseLow = FalseVal.getValue(0);5263SDValue FalseHigh = FalseVal.getValue(1);52645265SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,5266ARMcc, CCR, Cmp);5267SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,5268ARMcc, CCR, duplicateCmp(Cmp, DAG));52695270return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);5271} else {5272return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,5273Cmp);5274}5275}52765277static bool isGTorGE(ISD::CondCode CC) {5278return CC == ISD::SETGT || CC == ISD::SETGE;5279}52805281static bool isLTorLE(ISD::CondCode CC) {5282return CC == ISD::SETLT || CC == ISD::SETLE;5283}52845285// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.5286// All of these conditions (and their <= and >= counterparts) will do:5287// x < k ? k : x5288// x > k ? x : k5289// k < x ? x : k5290// k > x ? k : x5291static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,5292const SDValue TrueVal, const SDValue FalseVal,5293const ISD::CondCode CC, const SDValue K) {5294return (isGTorGE(CC) &&5295((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||5296(isLTorLE(CC) &&5297((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));5298}52995300// Check if two chained conditionals could be converted into SSAT or USAT.5301//5302// SSAT can replace a set of two conditional selectors that bound a number to an5303// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:5304//5305// x < -k ? -k : (x > k ? k : x)5306// x < -k ? -k : (x < k ? x : k)5307// x > -k ? (x > k ? k : x) : -k5308// x < k ? (x < -k ? -k : x) : k5309// etc.5310//5311// LLVM canonicalizes these to either a min(max()) or a max(min())5312// pattern. This function tries to match one of these and will return a SSAT5313// node if successful.5314//5315// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 15316// is a power of 2.5317static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {5318EVT VT = Op.getValueType();5319SDValue V1 = Op.getOperand(0);5320SDValue K1 = Op.getOperand(1);5321SDValue TrueVal1 = Op.getOperand(2);5322SDValue FalseVal1 = Op.getOperand(3);5323ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();53245325const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? 
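  // Worked example of the pattern matched here (values are illustrative): a
  // clamp to [-128, 127] gives constants -128 and 127; since -128 == ~127 and
  // 127 + 1 is a power of two, the pair of selects folds to an SSAT. A clamp
  // to [0, 127], i.e. NegVal == 0, folds to a USAT instead.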
FalseVal1 : TrueVal1;5326if (Op2.getOpcode() != ISD::SELECT_CC)5327return SDValue();53285329SDValue V2 = Op2.getOperand(0);5330SDValue K2 = Op2.getOperand(1);5331SDValue TrueVal2 = Op2.getOperand(2);5332SDValue FalseVal2 = Op2.getOperand(3);5333ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();53345335SDValue V1Tmp = V1;5336SDValue V2Tmp = V2;53375338// Check that the registers and the constants match a max(min()) or min(max())5339// pattern5340if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||5341K2 != FalseVal2 ||5342!((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))5343return SDValue();53445345// Check that the constant in the lower-bound check is5346// the opposite of the constant in the upper-bound check5347// in 1's complement.5348if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))5349return SDValue();53505351int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();5352int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();5353int64_t PosVal = std::max(Val1, Val2);5354int64_t NegVal = std::min(Val1, Val2);53555356if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||5357!isPowerOf2_64(PosVal + 1))5358return SDValue();53595360// Handle the difference between USAT (unsigned) and SSAT (signed)5361// saturation5362// At this point, PosVal is guaranteed to be positive5363uint64_t K = PosVal;5364SDLoc dl(Op);5365if (Val1 == ~Val2)5366return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,5367DAG.getConstant(llvm::countr_one(K), dl, VT));5368if (NegVal == 0)5369return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,5370DAG.getConstant(llvm::countr_one(K), dl, VT));53715372return SDValue();5373}53745375// Check if a condition of the type x < k ? k : x can be converted into a5376// bit operation instead of conditional moves.5377// Currently this is allowed given:5378// - The conditions and values match up5379// - k is 0 or -1 (all ones)5380// This function will not check the last condition, thats up to the caller5381// It returns true if the transformation can be made, and in such case5382// returns x in V, and k in SatK.5383static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,5384SDValue &SatK)5385{5386SDValue LHS = Op.getOperand(0);5387SDValue RHS = Op.getOperand(1);5388ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();5389SDValue TrueVal = Op.getOperand(2);5390SDValue FalseVal = Op.getOperand(3);53915392SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)5393? &RHS5394: nullptr;53955396// No constant operation in comparison, early out5397if (!K)5398return false;53995400SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;5401V = (KTmp == TrueVal) ? FalseVal : TrueVal;5402SDValue VTmp = (K && *K == LHS) ? 
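  // (For reference, the caller turns the matched pattern into bit operations:
  // with k == 0, "x < 0 ? 0 : x" becomes x & ~(x >> 31); with k == -1, it
  // becomes x | (x >> 31).)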
RHS : LHS;54035404// If the constant on left and right side, or variable on left and right,5405// does not match, early out5406if (*K != KTmp || V != VTmp)5407return false;54085409if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {5410SatK = *K;5411return true;5412}54135414return false;5415}54165417bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {5418if (VT == MVT::f32)5419return !Subtarget->hasVFP2Base();5420if (VT == MVT::f64)5421return !Subtarget->hasFP64();5422if (VT == MVT::f16)5423return !Subtarget->hasFullFP16();5424return false;5425}54265427SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {5428EVT VT = Op.getValueType();5429SDLoc dl(Op);54305431// Try to convert two saturating conditional selects into a single SSAT5432if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())5433if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))5434return SatValue;54355436// Try to convert expressions of the form x < k ? k : x (and similar forms)5437// into more efficient bit operations, which is possible when k is 0 or -15438// On ARM and Thumb-2 which have flexible operand 2 this will result in5439// single instructions. On Thumb the shift and the bit operation will be two5440// instructions.5441// Only allow this transformation on full-width (32-bit) operations5442SDValue LowerSatConstant;5443SDValue SatValue;5444if (VT == MVT::i32 &&5445isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {5446SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,5447DAG.getConstant(31, dl, VT));5448if (isNullConstant(LowerSatConstant)) {5449SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,5450DAG.getAllOnesConstant(dl, VT));5451return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);5452} else if (isAllOnesConstant(LowerSatConstant))5453return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);5454}54555456SDValue LHS = Op.getOperand(0);5457SDValue RHS = Op.getOperand(1);5458ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();5459SDValue TrueVal = Op.getOperand(2);5460SDValue FalseVal = Op.getOperand(3);5461ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);5462ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);54635464if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&5465LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {5466unsigned TVal = CTVal->getZExtValue();5467unsigned FVal = CFVal->getZExtValue();5468unsigned Opcode = 0;54695470if (TVal == ~FVal) {5471Opcode = ARMISD::CSINV;5472} else if (TVal == ~FVal + 1) {5473Opcode = ARMISD::CSNEG;5474} else if (TVal + 1 == FVal) {5475Opcode = ARMISD::CSINC;5476} else if (TVal == FVal + 1) {5477Opcode = ARMISD::CSINC;5478std::swap(TrueVal, FalseVal);5479std::swap(TVal, FVal);5480CC = ISD::getSetCCInverse(CC, LHS.getValueType());5481}54825483if (Opcode) {5484// If one of the constants is cheaper than another, materialise the5485// cheaper one and let the csel generate the other.5486if (Opcode != ARMISD::CSINC &&5487HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {5488std::swap(TrueVal, FalseVal);5489std::swap(TVal, FVal);5490CC = ISD::getSetCCInverse(CC, LHS.getValueType());5491}54925493// Attempt to use ZR checking TVal is 0, possibly inverting the condition5494// to get there. 
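      // (Examples of the opcode choice above, with illustrative constants:
      // TVal == ~FVal, e.g. 5 and -6, selects CSINV; TVal == -FVal, e.g. 5
      // and -5, selects CSNEG; consecutive values such as 5 and 6 select
      // CSINC.)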
      // CSINC is not invertible like the other two (~(~a) == a,
      // -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support.
    if (ARMcc->getAsZExtVal() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare.
If RHS is zero we prefer to keep it there so we5559// match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we5560// must use VSEL (limited condition codes), due to not having conditional f165561// moves.5562if (Subtarget->hasFPARMv8Base() &&5563!(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&5564(TrueVal.getValueType() == MVT::f16 ||5565TrueVal.getValueType() == MVT::f32 ||5566TrueVal.getValueType() == MVT::f64)) {5567bool swpCmpOps = false;5568bool swpVselOps = false;5569checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);55705571if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||5572CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {5573if (swpCmpOps)5574std::swap(LHS, RHS);5575if (swpVselOps)5576std::swap(TrueVal, FalseVal);5577}5578}55795580SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);5581SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);5582SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);5583SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);5584if (CondCode2 != ARMCC::AL) {5585SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);5586// FIXME: Needs another CMP because flag can have but one use.5587SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);5588Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);5589}5590return Result;5591}55925593/// canChangeToInt - Given the fp compare operand, return true if it is suitable5594/// to morph to an integer compare sequence.5595static bool canChangeToInt(SDValue Op, bool &SeenZero,5596const ARMSubtarget *Subtarget) {5597SDNode *N = Op.getNode();5598if (!N->hasOneUse())5599// Otherwise it requires moving the value from fp to integer registers.5600return false;5601if (!N->getNumValues())5602return false;5603EVT VT = Op.getValueType();5604if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())5605// f32 case is generally profitable. f64 case only makes sense when vcmpe +5606// vmrs are very slow, e.g. 
cortex-a8.5607return false;56085609if (isFloatingPointZero(Op)) {5610SeenZero = true;5611return true;5612}5613return ISD::isNormalLoad(N);5614}56155616static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {5617if (isFloatingPointZero(Op))5618return DAG.getConstant(0, SDLoc(Op), MVT::i32);56195620if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))5621return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),5622Ld->getPointerInfo(), Ld->getAlign(),5623Ld->getMemOperand()->getFlags());56245625llvm_unreachable("Unknown VFP cmp argument!");5626}56275628static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,5629SDValue &RetVal1, SDValue &RetVal2) {5630SDLoc dl(Op);56315632if (isFloatingPointZero(Op)) {5633RetVal1 = DAG.getConstant(0, dl, MVT::i32);5634RetVal2 = DAG.getConstant(0, dl, MVT::i32);5635return;5636}56375638if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {5639SDValue Ptr = Ld->getBasePtr();5640RetVal1 =5641DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),5642Ld->getAlign(), Ld->getMemOperand()->getFlags());56435644EVT PtrType = Ptr.getValueType();5645SDValue NewPtr = DAG.getNode(ISD::ADD, dl,5646PtrType, Ptr, DAG.getConstant(4, dl, PtrType));5647RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,5648Ld->getPointerInfo().getWithOffset(4),5649commonAlignment(Ld->getAlign(), 4),5650Ld->getMemOperand()->getFlags());5651return;5652}56535654llvm_unreachable("Unknown VFP cmp argument!");5655}56565657/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some5658/// f32 and even f64 comparisons to integer ones.5659SDValue5660ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {5661SDValue Chain = Op.getOperand(0);5662ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();5663SDValue LHS = Op.getOperand(2);5664SDValue RHS = Op.getOperand(3);5665SDValue Dest = Op.getOperand(4);5666SDLoc dl(Op);56675668bool LHSSeenZero = false;5669bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);5670bool RHSSeenZero = false;5671bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);5672if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {5673// If unsafe fp math optimization is enabled and there are no other uses of5674// the CMP operands, and the condition code is EQ or NE, we can optimize it5675// to an integer comparison.5676if (CC == ISD::SETOEQ)5677CC = ISD::SETEQ;5678else if (CC == ISD::SETUNE)5679CC = ISD::SETNE;56805681SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);5682SDValue ARMcc;5683if (LHS.getValueType() == MVT::f32) {5684LHS = DAG.getNode(ISD::AND, dl, MVT::i32,5685bitcastf32Toi32(LHS, DAG), Mask);5686RHS = DAG.getNode(ISD::AND, dl, MVT::i32,5687bitcastf32Toi32(RHS, DAG), Mask);5688SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);5689SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);5690return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,5691Chain, Dest, ARMcc, CCR, Cmp);5692}56935694SDValue LHS1, LHS2;5695SDValue RHS1, RHS2;5696expandf64Toi32(LHS, DAG, LHS1, LHS2);5697expandf64Toi32(RHS, DAG, RHS1, RHS2);5698LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);5699RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);5700ARMCC::CondCodes CondCode = IntCCToARMCC(CC);5701ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);5702SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);5703SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };5704return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);5705}57065707return SDValue();5708}57095710SDValue 
ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {5711SDValue Chain = Op.getOperand(0);5712SDValue Cond = Op.getOperand(1);5713SDValue Dest = Op.getOperand(2);5714SDLoc dl(Op);57155716// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch5717// instruction.5718unsigned Opc = Cond.getOpcode();5719bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&5720!Subtarget->isThumb1Only();5721if (Cond.getResNo() == 1 &&5722(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||5723Opc == ISD::USUBO || OptimizeMul)) {5724// Only lower legal XALUO ops.5725if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))5726return SDValue();57275728// The actual operation with overflow check.5729SDValue Value, OverflowCmp;5730SDValue ARMcc;5731std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);57325733// Reverse the condition code.5734ARMCC::CondCodes CondCode =5735(ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();5736CondCode = ARMCC::getOppositeCondition(CondCode);5737ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);5738SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);57395740return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,5741OverflowCmp);5742}57435744return SDValue();5745}57465747SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {5748SDValue Chain = Op.getOperand(0);5749ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();5750SDValue LHS = Op.getOperand(2);5751SDValue RHS = Op.getOperand(3);5752SDValue Dest = Op.getOperand(4);5753SDLoc dl(Op);57545755if (isUnsupportedFloatingType(LHS.getValueType())) {5756DAG.getTargetLoweringInfo().softenSetCCOperands(5757DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);57585759// If softenSetCCOperands only returned one value, we should compare it to5760// zero.5761if (!RHS.getNode()) {5762RHS = DAG.getConstant(0, dl, LHS.getValueType());5763CC = ISD::SETNE;5764}5765}57665767// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch5768// instruction.5769unsigned Opc = LHS.getOpcode();5770bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&5771!Subtarget->isThumb1Only();5772if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&5773(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||5774Opc == ISD::USUBO || OptimizeMul) &&5775(CC == ISD::SETEQ || CC == ISD::SETNE)) {5776// Only lower legal XALUO ops.5777if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))5778return SDValue();57795780// The actual operation with overflow check.5781SDValue Value, OverflowCmp;5782SDValue ARMcc;5783std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);57845785if ((CC == ISD::SETNE) != isOneConstant(RHS)) {5786// Reverse the condition code.5787ARMCC::CondCodes CondCode =5788(ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();5789CondCode = ARMCC::getOppositeCondition(CondCode);5790ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);5791}5792SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);57935794return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,5795OverflowCmp);5796}57975798if (LHS.getValueType() == MVT::i32) {5799SDValue ARMcc;5800SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);5801SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);5802return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,5803Chain, Dest, ARMcc, CCR, Cmp);5804}58055806if (getTargetMachine().Options.UnsafeFPMath &&5807(CC == 
ISD::SETEQ || CC == ISD::SETOEQ ||5808CC == ISD::SETNE || CC == ISD::SETUNE)) {5809if (SDValue Result = OptimizeVFPBrcond(Op, DAG))5810return Result;5811}58125813ARMCC::CondCodes CondCode, CondCode2;5814FPCCToARMCC(CC, CondCode, CondCode2);58155816SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);5817SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);5818SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);5819SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);5820SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };5821SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);5822if (CondCode2 != ARMCC::AL) {5823ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);5824SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };5825Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);5826}5827return Res;5828}58295830SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {5831SDValue Chain = Op.getOperand(0);5832SDValue Table = Op.getOperand(1);5833SDValue Index = Op.getOperand(2);5834SDLoc dl(Op);58355836EVT PTy = getPointerTy(DAG.getDataLayout());5837JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);5838SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);5839Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);5840Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));5841SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);5842if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {5843// Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table5844// which does another jump to the destination. This also makes it easier5845// to translate it to TBB / TBH later (Thumb2 only).5846// FIXME: This might not work if the function is extremely large.5847return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,5848Addr, Op.getOperand(2), JTI);5849}5850if (isPositionIndependent() || Subtarget->isROPI()) {5851Addr =5852DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,5853MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));5854Chain = Addr.getValue(1);5855Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);5856return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);5857} else {5858Addr =5859DAG.getLoad(PTy, dl, Chain, Addr,5860MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));5861Chain = Addr.getValue(1);5862return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);5863}5864}58655866static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {5867EVT VT = Op.getValueType();5868SDLoc dl(Op);58695870if (Op.getValueType().getVectorElementType() == MVT::i32) {5871if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)5872return Op;5873return DAG.UnrollVectorOp(Op.getNode());5874}58755876const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();58775878EVT NewTy;5879const EVT OpTy = Op.getOperand(0).getValueType();5880if (OpTy == MVT::v4f32)5881NewTy = MVT::v4i32;5882else if (OpTy == MVT::v4f16 && HasFullFP16)5883NewTy = MVT::v4i16;5884else if (OpTy == MVT::v8f16 && HasFullFP16)5885NewTy = MVT::v8i16;5886else5887llvm_unreachable("Invalid type for custom lowering!");58885889if (VT != MVT::v4i16 && VT != MVT::v8i16)5890return DAG.UnrollVectorOp(Op.getNode());58915892Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));5893return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);5894}58955896SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {5897EVT VT = Op.getValueType();5898if (VT.isVector())5899return 
    LowerVectorFP_TO_INT(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    SDValue Result;
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
                                                             : ISD::FP_TO_UINT,
                    Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}

static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT FromVT = Op.getOperand(0).getValueType();

  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
      Subtarget->hasFP64())
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
      Subtarget->hasFullFP16())
    return Op;
  if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
      Subtarget->hasMVEFloatOps())
    return Op;
  if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
      Subtarget->hasMVEFloatOps())
    return Op;

  if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
    return SDValue();

  SDLoc DL(Op);
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
  unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
  SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                            DAG.getValueType(VT.getScalarType()));
  SDValue Max = DAG.getNode(IsSigned ?
ISD::SMIN : ISD::UMIN, DL, VT, CVT,5965DAG.getConstant((1 << BW) - 1, DL, VT));5966if (IsSigned)5967Max = DAG.getNode(ISD::SMAX, DL, VT, Max,5968DAG.getConstant(-(1 << BW), DL, VT));5969return Max;5970}59715972static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {5973EVT VT = Op.getValueType();5974SDLoc dl(Op);59755976if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {5977if (VT.getVectorElementType() == MVT::f32)5978return Op;5979return DAG.UnrollVectorOp(Op.getNode());5980}59815982assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||5983Op.getOperand(0).getValueType() == MVT::v8i16) &&5984"Invalid type for custom lowering!");59855986const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();59875988EVT DestVecType;5989if (VT == MVT::v4f32)5990DestVecType = MVT::v4i32;5991else if (VT == MVT::v4f16 && HasFullFP16)5992DestVecType = MVT::v4i16;5993else if (VT == MVT::v8f16 && HasFullFP16)5994DestVecType = MVT::v8i16;5995else5996return DAG.UnrollVectorOp(Op.getNode());59975998unsigned CastOpc;5999unsigned Opc;6000switch (Op.getOpcode()) {6001default: llvm_unreachable("Invalid opcode!");6002case ISD::SINT_TO_FP:6003CastOpc = ISD::SIGN_EXTEND;6004Opc = ISD::SINT_TO_FP;6005break;6006case ISD::UINT_TO_FP:6007CastOpc = ISD::ZERO_EXTEND;6008Opc = ISD::UINT_TO_FP;6009break;6010}60116012Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));6013return DAG.getNode(Opc, dl, VT, Op);6014}60156016SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {6017EVT VT = Op.getValueType();6018if (VT.isVector())6019return LowerVectorINT_TO_FP(Op, DAG);6020if (isUnsupportedFloatingType(VT)) {6021RTLIB::Libcall LC;6022if (Op.getOpcode() == ISD::SINT_TO_FP)6023LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),6024Op.getValueType());6025else6026LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),6027Op.getValueType());6028MakeLibCallOptions CallOptions;6029return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),6030CallOptions, SDLoc(Op)).first;6031}60326033return Op;6034}60356036SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {6037// Implement fcopysign with a fabs and a conditional fneg.6038SDValue Tmp0 = Op.getOperand(0);6039SDValue Tmp1 = Op.getOperand(1);6040SDLoc dl(Op);6041EVT VT = Op.getValueType();6042EVT SrcVT = Tmp1.getValueType();6043bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||6044Tmp0.getOpcode() == ARMISD::VMOVDRR;6045bool UseNEON = !InGPR && Subtarget->hasNEON();60466047if (UseNEON) {6048// Use VBSL to copy the sign bit.6049unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);6050SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,6051DAG.getTargetConstant(EncodedVal, dl, MVT::i32));6052EVT OpVT = (VT == MVT::f32) ? 
MVT::v2i32 : MVT::v1i64;6053if (VT == MVT::f64)6054Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,6055DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),6056DAG.getConstant(32, dl, MVT::i32));6057else /*if (VT == MVT::f32)*/6058Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);6059if (SrcVT == MVT::f32) {6060Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);6061if (VT == MVT::f64)6062Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,6063DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),6064DAG.getConstant(32, dl, MVT::i32));6065} else if (VT == MVT::f32)6066Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,6067DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),6068DAG.getConstant(32, dl, MVT::i32));6069Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);6070Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);60716072SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),6073dl, MVT::i32);6074AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);6075SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,6076DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));60776078SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,6079DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),6080DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));6081if (VT == MVT::f32) {6082Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);6083Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,6084DAG.getConstant(0, dl, MVT::i32));6085} else {6086Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);6087}60886089return Res;6090}60916092// Bitcast operand 1 to i32.6093if (SrcVT == MVT::f64)6094Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),6095Tmp1).getValue(1);6096Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);60976098// Or in the signbit with integer operations.6099SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);6100SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);6101Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);6102if (VT == MVT::f32) {6103Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,6104DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);6105return DAG.getNode(ISD::BITCAST, dl, MVT::f32,6106DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));6107}61086109// f64: Or the high part with signbit and then combine two parts.6110Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),6111Tmp0);6112SDValue Lo = Tmp0.getValue(0);6113SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);6114Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);6115return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);6116}61176118SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{6119MachineFunction &MF = DAG.getMachineFunction();6120MachineFrameInfo &MFI = MF.getFrameInfo();6121MFI.setReturnAddressIsTaken(true);61226123if (verifyReturnAddressArgumentIsConstant(Op, DAG))6124return SDValue();61256126EVT VT = Op.getValueType();6127SDLoc dl(Op);6128unsigned Depth = Op.getConstantOperandVal(0);6129if (Depth) {6130SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);6131SDValue Offset = DAG.getConstant(4, dl, MVT::i32);6132return DAG.getLoad(VT, dl, DAG.getEntryNode(),6133DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),6134MachinePointerInfo());6135}61366137// Return LR, which contains the return address. 
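  // (When a non-zero depth is requested, the code above instead loads the
  // return address of the corresponding frame from FrameAddr + 4.)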
Mark it an implicit live-in.6138Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));6139return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);6140}61416142SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {6143const ARMBaseRegisterInfo &ARI =6144*static_cast<const ARMBaseRegisterInfo*>(RegInfo);6145MachineFunction &MF = DAG.getMachineFunction();6146MachineFrameInfo &MFI = MF.getFrameInfo();6147MFI.setFrameAddressIsTaken(true);61486149EVT VT = Op.getValueType();6150SDLoc dl(Op); // FIXME probably not meaningful6151unsigned Depth = Op.getConstantOperandVal(0);6152Register FrameReg = ARI.getFrameRegister(MF);6153SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);6154while (Depth--)6155FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,6156MachinePointerInfo());6157return FrameAddr;6158}61596160// FIXME? Maybe this could be a TableGen attribute on some registers and6161// this table could be generated automatically from RegInfo.6162Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,6163const MachineFunction &MF) const {6164Register Reg = StringSwitch<unsigned>(RegName)6165.Case("sp", ARM::SP)6166.Default(0);6167if (Reg)6168return Reg;6169report_fatal_error(Twine("Invalid register name \""6170+ StringRef(RegName) + "\"."));6171}61726173// Result is 64 bit value so split into two 32 bit values and return as a6174// pair of values.6175static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,6176SelectionDAG &DAG) {6177SDLoc DL(N);61786179// This function is only supposed to be called for i64 type destination.6180assert(N->getValueType(0) == MVT::i646181&& "ExpandREAD_REGISTER called for non-i64 type result.");61826183SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,6184DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),6185N->getOperand(0),6186N->getOperand(1));61876188Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),6189Read.getValue(1)));6190Results.push_back(Read.getOperand(0));6191}61926193/// \p BC is a bitcast that is about to be turned into a VMOVDRR.6194/// When \p DstVT, the destination type of \p BC, is on the vector6195/// register bank and the source of bitcast, \p Op, operates on the same bank,6196/// it might be possible to combine them, such that everything stays on the6197/// vector register bank.6198/// \p return The node that would replace \p BT, if the combine6199/// is possible.6200static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,6201SelectionDAG &DAG) {6202SDValue Op = BC->getOperand(0);6203EVT DstVT = BC->getValueType(0);62046205// The only vector instruction that can produce a scalar (remember,6206// since the bitcast was about to be turned into VMOVDRR, the source6207// type is i64) from a vector is EXTRACT_VECTOR_ELT.6208// Moreover, we can do this combine only if there is one use.6209// Finally, if the destination type is not a vector, there is not6210// much point on forcing everything on the vector bank.6211if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||6212!Op.hasOneUse())6213return SDValue();62146215// If the index is not constant, we will introduce an additional6216// multiply that will stick.6217// Give up in that case.6218ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));6219if (!Index)6220return SDValue();6221unsigned DstNumElt = DstVT.getVectorNumElements();62226223// Compute the new index.6224const APInt &APIntIndex = Index->getAPIntValue();6225APInt 
NewIndex(APIntIndex.getBitWidth(), DstNumElt);6226NewIndex *= APIntIndex;6227// Check if the new constant index fits into i32.6228if (NewIndex.getBitWidth() > 32)6229return SDValue();62306231// vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->6232// vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)6233SDLoc dl(Op);6234SDValue ExtractSrc = Op.getOperand(0);6235EVT VecVT = EVT::getVectorVT(6236*DAG.getContext(), DstVT.getScalarType(),6237ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);6238SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);6239return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,6240DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));6241}62426243/// ExpandBITCAST - If the target supports VFP, this function is called to6244/// expand a bit convert where either the source or destination type is i64 to6245/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i646246/// operand type is illegal (e.g., v2f32 for a target that doesn't support6247/// vectors), since the legalizer won't know what to do with that.6248SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,6249const ARMSubtarget *Subtarget) const {6250const TargetLowering &TLI = DAG.getTargetLoweringInfo();6251SDLoc dl(N);6252SDValue Op = N->getOperand(0);62536254// This function is only supposed to be called for i16 and i64 types, either6255// as the source or destination of the bit convert.6256EVT SrcVT = Op.getValueType();6257EVT DstVT = N->getValueType(0);62586259if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&6260(DstVT == MVT::f16 || DstVT == MVT::bf16))6261return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),6262DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));62636264if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&6265(SrcVT == MVT::f16 || SrcVT == MVT::bf16))6266return DAG.getNode(6267ISD::TRUNCATE, SDLoc(N), DstVT,6268MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));62696270if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))6271return SDValue();62726273// Turn i64->f64 into VMOVDRR.6274if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {6275// Do not force values to GPRs (this is what VMOVDRR does for the inputs)6276// if we can combine the bitcast with its source.6277if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))6278return Val;6279SDValue Lo, Hi;6280std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);6281return DAG.getNode(ISD::BITCAST, dl, DstVT,6282DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));6283}62846285// Turn f64->i64 into VMOVRRD.6286if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {6287SDValue Cvt;6288if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&6289SrcVT.getVectorNumElements() > 1)6290Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,6291DAG.getVTList(MVT::i32, MVT::i32),6292DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));6293else6294Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,6295DAG.getVTList(MVT::i32, MVT::i32), Op);6296// Merge the pieces into a single i64 value.6297return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));6298}62996300return SDValue();6301}63026303/// getZeroVector - Returns a vector of specified type with all zero elements.6304/// Zero vectors are used to represent vector negation and in those cases6305/// will be implemented with the NEON VNEG instruction. However, VNEG does6306/// not support i64 elements, so sometimes the zero vectors will need to be6307/// explicitly constructed. 
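/// (For example, an IR-level "sub <4 x i32> zeroinitializer, %x" is a vector
/// negation that can be selected to VNEG.S32, whereas a negation with i64
/// elements cannot, so its zero vector has to be materialized.)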
/// Regardless, use a canonical VMOV to create the zero vector.
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(VT.isVector() && "Expected a vector type");
  // The canonical modified immediate encoding of a zero vector is....0!
  SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
}

/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                           ARMcc, CCR, CmpLo);

  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                           ARMcc, CCR, CmpHi);

  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
  // so that the shift and the AND get folded into a bitfield extract.
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};

  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                            DAG.getConstant(3, dl, MVT::i32));
  return DAG.getMergeValues({And, Chain}, dl);
}

SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue RMValue = Op->getOperand(1);

  // The rounding mode is in bits 23:22 of the FPSCR.
  // The llvm.set.rounding argument value to ARM rounding mode value mapping
  // is 0->3, 1->0, 2->1, 3->2.
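  // For example, an argument of 1 (round to nearest) selects ARM mode 0 and
  // an argument of 0 (round toward zero) selects ARM mode 3, since
  // ((1 - 1) & 3) == 0 and ((0 - 1) & 3) == 3.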
  // The formula we use to implement this is ((arg - 1) & 3) << 22.
  //
  // It is expected that the argument of llvm.set.rounding is within the
  // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
  // responsibility of the code that generates llvm.set.rounding to ensure
  // this condition.

  // Calculate new value of FPSCR[23:22].
  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
                        DAG.getConstant(1, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
                        DAG.getConstant(0x3, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
                        DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));

  // Get current value of FPSCR.
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  // Put new rounding mode into FPSCR[23:22].
  const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
  FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
                      DAG.getConstant(RMMask, DL, MVT::i32));
  FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
  SDValue Ops2[] = {
      Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}

SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue Mode = Op->getOperand(1);

  // Generate nodes to build:
  // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  SDValue FPSCRMasked =
      DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
                  DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
  SDValue InputMasked =
      DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
                  DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
  FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);

  SDValue Ops2[] = {
      Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}

SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);

  // To get the default FP mode all control bits are cleared:
  // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  SDValue FPSCRMasked = DAG.getNode(
      ISD::AND, DL, MVT::i32, FPSCR,
      DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
  SDValue Ops2[] = {Chain,
                    DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
                    FPSCRMasked};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}

static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                         const ARMSubtarget *ST) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector() && ST->hasNEON()) {

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX
= DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);6526SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);65276528EVT ElemTy = VT.getVectorElementType();65296530if (ElemTy == MVT::i8) {6531// Compute with: cttz(x) = ctpop(lsb - 1)6532SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,6533DAG.getTargetConstant(1, dl, ElemTy));6534SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);6535return DAG.getNode(ISD::CTPOP, dl, VT, Bits);6536}65376538if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&6539(N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {6540// Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 06541unsigned NumBits = ElemTy.getSizeInBits();6542SDValue WidthMinus1 =6543DAG.getNode(ARMISD::VMOVIMM, dl, VT,6544DAG.getTargetConstant(NumBits - 1, dl, ElemTy));6545SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);6546return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);6547}65486549// Compute with: cttz(x) = ctpop(lsb - 1)65506551// Compute LSB - 1.6552SDValue Bits;6553if (ElemTy == MVT::i64) {6554// Load constant 0xffff'ffff'ffff'ffff to register.6555SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,6556DAG.getTargetConstant(0x1eff, dl, MVT::i32));6557Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);6558} else {6559SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,6560DAG.getTargetConstant(1, dl, ElemTy));6561Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);6562}6563return DAG.getNode(ISD::CTPOP, dl, VT, Bits);6564}65656566if (!ST->hasV6T2Ops())6567return SDValue();65686569SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));6570return DAG.getNode(ISD::CTLZ, dl, VT, rbit);6571}65726573static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,6574const ARMSubtarget *ST) {6575EVT VT = N->getValueType(0);6576SDLoc DL(N);65776578assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");6579assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||6580VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&6581"Unexpected type for custom ctpop lowering");65826583const TargetLowering &TLI = DAG.getTargetLoweringInfo();6584EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;6585SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));6586Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);65876588// Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.6589unsigned EltSize = 8;6590unsigned NumElts = VT.is64BitVector() ? 
8 : 16;6591while (EltSize != VT.getScalarSizeInBits()) {6592SmallVector<SDValue, 8> Ops;6593Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,6594TLI.getPointerTy(DAG.getDataLayout())));6595Ops.push_back(Res);65966597EltSize *= 2;6598NumElts /= 2;6599MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);6600Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);6601}66026603return Res;6604}66056606/// Getvshiftimm - Check if this is a valid build_vector for the immediate6607/// operand of a vector shift operation, where all the elements of the6608/// build_vector must have the same constant integer value.6609static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {6610// Ignore bit_converts.6611while (Op.getOpcode() == ISD::BITCAST)6612Op = Op.getOperand(0);6613BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());6614APInt SplatBits, SplatUndef;6615unsigned SplatBitSize;6616bool HasAnyUndefs;6617if (!BVN ||6618!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,6619ElementBits) ||6620SplatBitSize > ElementBits)6621return false;6622Cnt = SplatBits.getSExtValue();6623return true;6624}66256626/// isVShiftLImm - Check if this is a valid build_vector for the immediate6627/// operand of a vector shift left operation. That value must be in the range:6628/// 0 <= Value < ElementBits for a left shift; or6629/// 0 <= Value <= ElementBits for a long left shift.6630static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {6631assert(VT.isVector() && "vector shift count is not a vector type");6632int64_t ElementBits = VT.getScalarSizeInBits();6633if (!getVShiftImm(Op, ElementBits, Cnt))6634return false;6635return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);6636}66376638/// isVShiftRImm - Check if this is a valid build_vector for the immediate6639/// operand of a vector shift right operation. For a shift opcode, the value6640/// is positive, but for an intrinsic the value count must be negative. The6641/// absolute value must be in the range:6642/// 1 <= |Value| <= ElementBits for a right shift; or6643/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.6644static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,6645int64_t &Cnt) {6646assert(VT.isVector() && "vector shift count is not a vector type");6647int64_t ElementBits = VT.getScalarSizeInBits();6648if (!getVShiftImm(Op, ElementBits, Cnt))6649return false;6650if (!isIntrinsic)6651return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));6652if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {6653Cnt = -Cnt;6654return true;6655}6656return false;6657}66586659static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,6660const ARMSubtarget *ST) {6661EVT VT = N->getValueType(0);6662SDLoc dl(N);6663int64_t Cnt;66646665if (!VT.isVector())6666return SDValue();66676668// We essentially have two forms here. Shift by an immediate and shift by a6669// vector register (there are also shift by a gpr, but that is just handled6670// with a tablegen pattern). 
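  // (As a rough illustration of the two paths below: a v4i32 shl by a
  // constant splat of 3 becomes an ARMISD::VSHLIMM with immediate 3, whereas
  // a v4i32 srl by a non-constant vector amount becomes an ARMISD::VSHLu of
  // the negated shift amount, since only a left shift by register exists.)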
We cannot easily match shift by an immediate in6671// tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.6672// For shifting by a vector, we don't have VSHR, only VSHL (which can be6673// signed or unsigned, and a negative shift indicates a shift right).6674if (N->getOpcode() == ISD::SHL) {6675if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))6676return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),6677DAG.getConstant(Cnt, dl, MVT::i32));6678return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),6679N->getOperand(1));6680}66816682assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&6683"unexpected vector shift opcode");66846685if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {6686unsigned VShiftOpc =6687(N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);6688return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),6689DAG.getConstant(Cnt, dl, MVT::i32));6690}66916692// Other right shifts we don't have operations for (we use a shift left by a6693// negative number).6694EVT ShiftVT = N->getOperand(1).getValueType();6695SDValue NegatedCount = DAG.getNode(6696ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));6697unsigned VShiftOpc =6698(N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);6699return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);6700}67016702static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,6703const ARMSubtarget *ST) {6704EVT VT = N->getValueType(0);6705SDLoc dl(N);67066707// We can get here for a node like i32 = ISD::SHL i32, i646708if (VT != MVT::i64)6709return SDValue();67106711assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||6712N->getOpcode() == ISD::SHL) &&6713"Unknown shift to lower!");67146715unsigned ShOpc = N->getOpcode();6716if (ST->hasMVEIntegerOps()) {6717SDValue ShAmt = N->getOperand(1);6718unsigned ShPartsOpc = ARMISD::LSLL;6719ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);67206721// If the shift amount is greater than 32 or has a greater bitwidth than 646722// then do the default optimisation6723if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||6724(Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))6725return SDValue();67266727// Extract the lower 32 bits of the shift amount if it's not an i326728if (ShAmt->getValueType(0) != MVT::i32)6729ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);67306731if (ShOpc == ISD::SRL) {6732if (!Con)6733// There is no t2LSRLr instruction so negate and perform an lsll if the6734// shift amount is in a register, emulating a right shift.6735ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,6736DAG.getConstant(0, dl, MVT::i32), ShAmt);6737else6738// Else generate an lsrl on the immediate shift amount6739ShPartsOpc = ARMISD::LSRL;6740} else if (ShOpc == ISD::SRA)6741ShPartsOpc = ARMISD::ASRL;67426743// Split Lower/Upper 32 bits of the destination/source6744SDValue Lo, Hi;6745std::tie(Lo, Hi) =6746DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);6747// Generate the shift operation as computed above6748Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,6749ShAmt);6750// The upper 32 bits come from the second return value of lsll6751Hi = SDValue(Lo.getNode(), 1);6752return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);6753}67546755// We only lower SRA, SRL of 1 here, all others use generic lowering.6756if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)6757return SDValue();67586759// If we are in thumb mode, we 
  // don't have RRX.
  if (ST->isThumb1Only())
    return SDValue();

  // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);

  // First, build a SRA_GLUE/SRL_GLUE op, which shifts the top part by one and
  // captures the shifted-out bit in the carry flag.
  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_GLUE:ARMISD::SRA_GLUE;
  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);

  // The low part is an ARMISD::RRX operation, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}

static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
  bool Invert = false;
  bool Swap = false;
  unsigned Opc = ARMCC::AL;

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDLoc dl(Op);

  EVT CmpVT;
  if (ST->hasNEON())
    CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
  else {
    assert(ST->hasMVEIntegerOps() &&
           "No hardware support for integer vector comparison!");

    if (Op.getValueType().getVectorElementType() != MVT::i1)
      return SDValue();

    // Make sure we expand floating point setcc to scalar if we do not have
    // mve.fp, so that we can handle them from there.
    if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
      return SDValue();

    CmpVT = VT;
  }

  if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
      (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
    // Special-case integer 64-bit equality comparisons.
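    // (Sketch of the sequence built below: bitcast both operands to i32
    // vectors, compare them with a 32-bit vector compare, then AND each
    // 64-bit lane with a VREV64-swapped copy of itself so that both halves
    // must match; SETNE is handled by inverting the final result.)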
They aren't legal,6813// but they can be lowered with a few vector instructions.6814unsigned CmpElements = CmpVT.getVectorNumElements() * 2;6815EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);6816SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);6817SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);6818SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,6819DAG.getCondCode(ISD::SETEQ));6820SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);6821SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);6822Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);6823if (SetCCOpcode == ISD::SETNE)6824Merged = DAG.getNOT(dl, Merged, CmpVT);6825Merged = DAG.getSExtOrTrunc(Merged, dl, VT);6826return Merged;6827}68286829if (CmpVT.getVectorElementType() == MVT::i64)6830// 64-bit comparisons are not legal in general.6831return SDValue();68326833if (Op1.getValueType().isFloatingPoint()) {6834switch (SetCCOpcode) {6835default: llvm_unreachable("Illegal FP comparison");6836case ISD::SETUNE:6837case ISD::SETNE:6838if (ST->hasMVEFloatOps()) {6839Opc = ARMCC::NE; break;6840} else {6841Invert = true; [[fallthrough]];6842}6843case ISD::SETOEQ:6844case ISD::SETEQ: Opc = ARMCC::EQ; break;6845case ISD::SETOLT:6846case ISD::SETLT: Swap = true; [[fallthrough]];6847case ISD::SETOGT:6848case ISD::SETGT: Opc = ARMCC::GT; break;6849case ISD::SETOLE:6850case ISD::SETLE: Swap = true; [[fallthrough]];6851case ISD::SETOGE:6852case ISD::SETGE: Opc = ARMCC::GE; break;6853case ISD::SETUGE: Swap = true; [[fallthrough]];6854case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;6855case ISD::SETUGT: Swap = true; [[fallthrough]];6856case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;6857case ISD::SETUEQ: Invert = true; [[fallthrough]];6858case ISD::SETONE: {6859// Expand this to (OLT | OGT).6860SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,6861DAG.getConstant(ARMCC::GT, dl, MVT::i32));6862SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,6863DAG.getConstant(ARMCC::GT, dl, MVT::i32));6864SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);6865if (Invert)6866Result = DAG.getNOT(dl, Result, VT);6867return Result;6868}6869case ISD::SETUO: Invert = true; [[fallthrough]];6870case ISD::SETO: {6871// Expand this to (OLT | OGE).6872SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,6873DAG.getConstant(ARMCC::GT, dl, MVT::i32));6874SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,6875DAG.getConstant(ARMCC::GE, dl, MVT::i32));6876SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);6877if (Invert)6878Result = DAG.getNOT(dl, Result, VT);6879return Result;6880}6881}6882} else {6883// Integer comparisons.6884switch (SetCCOpcode) {6885default: llvm_unreachable("Illegal integer comparison");6886case ISD::SETNE:6887if (ST->hasMVEIntegerOps()) {6888Opc = ARMCC::NE; break;6889} else {6890Invert = true; [[fallthrough]];6891}6892case ISD::SETEQ: Opc = ARMCC::EQ; break;6893case ISD::SETLT: Swap = true; [[fallthrough]];6894case ISD::SETGT: Opc = ARMCC::GT; break;6895case ISD::SETLE: Swap = true; [[fallthrough]];6896case ISD::SETGE: Opc = ARMCC::GE; break;6897case ISD::SETULT: Swap = true; [[fallthrough]];6898case ISD::SETUGT: Opc = ARMCC::HI; break;6899case ISD::SETULE: Swap = true; [[fallthrough]];6900case ISD::SETUGE: Opc = ARMCC::HS; break;6901}69026903// Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).6904if (ST->hasNEON() && Opc == ARMCC::EQ) {6905SDValue 
AndOp;6906if (ISD::isBuildVectorAllZeros(Op1.getNode()))6907AndOp = Op0;6908else if (ISD::isBuildVectorAllZeros(Op0.getNode()))6909AndOp = Op1;69106911// Ignore bitconvert.6912if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)6913AndOp = AndOp.getOperand(0);69146915if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {6916Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));6917Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));6918SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);6919if (!Invert)6920Result = DAG.getNOT(dl, Result, VT);6921return Result;6922}6923}6924}69256926if (Swap)6927std::swap(Op0, Op1);69286929// If one of the operands is a constant vector zero, attempt to fold the6930// comparison to a specialized compare-against-zero form.6931if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&6932(Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||6933Opc == ARMCC::NE)) {6934if (Opc == ARMCC::GE)6935Opc = ARMCC::LE;6936else if (Opc == ARMCC::GT)6937Opc = ARMCC::LT;6938std::swap(Op0, Op1);6939}69406941SDValue Result;6942if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&6943(Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||6944Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))6945Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,6946DAG.getConstant(Opc, dl, MVT::i32));6947else6948Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,6949DAG.getConstant(Opc, dl, MVT::i32));69506951Result = DAG.getSExtOrTrunc(Result, dl, VT);69526953if (Invert)6954Result = DAG.getNOT(dl, Result, VT);69556956return Result;6957}69586959static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {6960SDValue LHS = Op.getOperand(0);6961SDValue RHS = Op.getOperand(1);6962SDValue Carry = Op.getOperand(2);6963SDValue Cond = Op.getOperand(3);6964SDLoc DL(Op);69656966assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");69676968// ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we6969// have to invert the carry first.6970Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,6971DAG.getConstant(1, DL, MVT::i32), Carry);6972// This converts the boolean value carry into the carry flag.6973Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);69746975SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);6976SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);69776978SDValue FVal = DAG.getConstant(0, DL, MVT::i32);6979SDValue TVal = DAG.getConstant(1, DL, MVT::i32);6980SDValue ARMcc = DAG.getConstant(6981IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);6982SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);6983SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,6984Cmp.getValue(1), SDValue());6985return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,6986CCR, Chain.getValue(1));6987}69886989/// isVMOVModifiedImm - Check if the specified splat value corresponds to a6990/// valid vector constant for a NEON or MVE instruction with a "modified6991/// immediate" operand (e.g., VMOV). If so, return the encoded value.6992static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,6993unsigned SplatBitSize, SelectionDAG &DAG,6994const SDLoc &dl, EVT &VT, EVT VectorVT,6995VMOVModImmType type) {6996unsigned OpCmode, Imm;6997bool is128Bits = VectorVT.is128BitVector();69986999// SplatBitSize is set to the smallest size that splats the vector, so a7000// zero vector will always have SplatBitSize == 8. 
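  // (For instance, among the 32-bit cases handled below, a splat of the form
  // 0x00nn0000 is encoded with Cmode=010x and Imm=nn, while the 0x0000nnff
  // form needs Cmode=1100, which is not available to OtherModImm (VORR/VBIC)
  // callers.)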
However, NEON modified7001// immediate instructions others than VMOV do not support the 8-bit encoding7002// of a zero vector, and the default encoding of zero is supposed to be the7003// 32-bit version.7004if (SplatBits == 0)7005SplatBitSize = 32;70067007switch (SplatBitSize) {7008case 8:7009if (type != VMOVModImm)7010return SDValue();7011// Any 1-byte value is OK. Op=0, Cmode=1110.7012assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");7013OpCmode = 0xe;7014Imm = SplatBits;7015VT = is128Bits ? MVT::v16i8 : MVT::v8i8;7016break;70177018case 16:7019// NEON's 16-bit VMOV supports splat values where only one byte is nonzero.7020VT = is128Bits ? MVT::v8i16 : MVT::v4i16;7021if ((SplatBits & ~0xff) == 0) {7022// Value = 0x00nn: Op=x, Cmode=100x.7023OpCmode = 0x8;7024Imm = SplatBits;7025break;7026}7027if ((SplatBits & ~0xff00) == 0) {7028// Value = 0xnn00: Op=x, Cmode=101x.7029OpCmode = 0xa;7030Imm = SplatBits >> 8;7031break;7032}7033return SDValue();70347035case 32:7036// NEON's 32-bit VMOV supports splat values where:7037// * only one byte is nonzero, or7038// * the least significant byte is 0xff and the second byte is nonzero, or7039// * the least significant 2 bytes are 0xff and the third is nonzero.7040VT = is128Bits ? MVT::v4i32 : MVT::v2i32;7041if ((SplatBits & ~0xff) == 0) {7042// Value = 0x000000nn: Op=x, Cmode=000x.7043OpCmode = 0;7044Imm = SplatBits;7045break;7046}7047if ((SplatBits & ~0xff00) == 0) {7048// Value = 0x0000nn00: Op=x, Cmode=001x.7049OpCmode = 0x2;7050Imm = SplatBits >> 8;7051break;7052}7053if ((SplatBits & ~0xff0000) == 0) {7054// Value = 0x00nn0000: Op=x, Cmode=010x.7055OpCmode = 0x4;7056Imm = SplatBits >> 16;7057break;7058}7059if ((SplatBits & ~0xff000000) == 0) {7060// Value = 0xnn000000: Op=x, Cmode=011x.7061OpCmode = 0x6;7062Imm = SplatBits >> 24;7063break;7064}70657066// cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC7067if (type == OtherModImm) return SDValue();70687069if ((SplatBits & ~0xffff) == 0 &&7070((SplatBits | SplatUndef) & 0xff) == 0xff) {7071// Value = 0x0000nnff: Op=x, Cmode=1100.7072OpCmode = 0xc;7073Imm = SplatBits >> 8;7074break;7075}70767077// cmode == 0b1101 is not supported for MVE VMVN7078if (type == MVEVMVNModImm)7079return SDValue();70807081if ((SplatBits & ~0xffffff) == 0 &&7082((SplatBits | SplatUndef) & 0xffff) == 0xffff) {7083// Value = 0x00nnffff: Op=x, Cmode=1101.7084OpCmode = 0xd;7085Imm = SplatBits >> 16;7086break;7087}70887089// Note: there are a few 32-bit splat values (specifically: 00ffff00,7090// ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not7091// VMOV.I32. A (very) minor optimization would be to replicate the value7092// and fall through here to test for a valid 64-bit splat. 
But, then the7093// caller would also need to check and handle the change in size.7094return SDValue();70957096case 64: {7097if (type != VMOVModImm)7098return SDValue();7099// NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.7100uint64_t BitMask = 0xff;7101unsigned ImmMask = 1;7102Imm = 0;7103for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {7104if (((SplatBits | SplatUndef) & BitMask) == BitMask) {7105Imm |= ImmMask;7106} else if ((SplatBits & BitMask) != 0) {7107return SDValue();7108}7109BitMask <<= 8;7110ImmMask <<= 1;7111}71127113if (DAG.getDataLayout().isBigEndian()) {7114// Reverse the order of elements within the vector.7115unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;7116unsigned Mask = (1 << BytesPerElem) - 1;7117unsigned NumElems = 8 / BytesPerElem;7118unsigned NewImm = 0;7119for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {7120unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);7121NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;7122}7123Imm = NewImm;7124}71257126// Op=1, Cmode=1110.7127OpCmode = 0x1e;7128VT = is128Bits ? MVT::v2i64 : MVT::v1i64;7129break;7130}71317132default:7133llvm_unreachable("unexpected size for isVMOVModifiedImm");7134}71357136unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);7137return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);7138}71397140SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,7141const ARMSubtarget *ST) const {7142EVT VT = Op.getValueType();7143bool IsDouble = (VT == MVT::f64);7144ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);7145const APFloat &FPVal = CFP->getValueAPF();71467147// Prevent floating-point constants from using literal loads7148// when execute-only is enabled.7149if (ST->genExecuteOnly()) {7150// We shouldn't trigger this for v6m execute-only7151assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&7152"Unexpected architecture");71537154// If we can represent the constant as an immediate, don't lower it7155if (isFPImmLegal(FPVal, VT))7156return Op;7157// Otherwise, construct as integer, and move to float register7158APInt INTVal = FPVal.bitcastToAPInt();7159SDLoc DL(CFP);7160switch (VT.getSimpleVT().SimpleTy) {7161default:7162llvm_unreachable("Unknown floating point type!");7163break;7164case MVT::f64: {7165SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);7166SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);7167return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);7168}7169case MVT::f32:7170return DAG.getNode(ARMISD::VMOVSR, DL, VT,7171DAG.getConstant(INTVal, DL, MVT::i32));7172}7173}71747175if (!ST->hasVFP3Base())7176return SDValue();71777178// Use the default (constant pool) lowering for double constants when we have7179// an SP-only FPU7180if (IsDouble && !Subtarget->hasFP64())7181return SDValue();71827183// Try splatting with a VMOV.f32...7184int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);71857186if (ImmVal != -1) {7187if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {7188// We have code in place to select a valid ConstantFP already, no need to7189// do any mangling.7190return Op;7191}71927193// It's a float and we are trying to use NEON operations where7194// possible. 
Lower it to a splat followed by an extract.7195SDLoc DL(Op);7196SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);7197SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,7198NewVal);7199return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,7200DAG.getConstant(0, DL, MVT::i32));7201}72027203// The rest of our options are NEON only, make sure that's allowed before7204// proceeding..7205if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))7206return SDValue();72077208EVT VMovVT;7209uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();72107211// It wouldn't really be worth bothering for doubles except for one very7212// important value, which does happen to match: 0.0. So make sure we don't do7213// anything stupid.7214if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))7215return SDValue();72167217// Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).7218SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),7219VMovVT, VT, VMOVModImm);7220if (NewVal != SDValue()) {7221SDLoc DL(Op);7222SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,7223NewVal);7224if (IsDouble)7225return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);72267227// It's a float: cast and extract a vector element.7228SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,7229VecConstant);7230return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,7231DAG.getConstant(0, DL, MVT::i32));7232}72337234// Finally, try a VMVN.i327235NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,7236VT, VMVNModImm);7237if (NewVal != SDValue()) {7238SDLoc DL(Op);7239SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);72407241if (IsDouble)7242return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);72437244// It's a float: cast and extract a vector element.7245SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,7246VecConstant);7247return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,7248DAG.getConstant(0, DL, MVT::i32));7249}72507251return SDValue();7252}72537254// check if an VEXT instruction can handle the shuffle mask when the7255// vector sources of the shuffle are the same.7256static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {7257unsigned NumElts = VT.getVectorNumElements();72587259// Assume that the first shuffle index is not UNDEF. Fail if it is.7260if (M[0] < 0)7261return false;72627263Imm = M[0];72647265// If this is a VEXT shuffle, the immediate value is the index of the first7266// element. The other shuffle indices must be the successive elements after7267// the first one.7268unsigned ExpectedElt = Imm;7269for (unsigned i = 1; i < NumElts; ++i) {7270// Increment the expected index. If it wraps around, just follow it7271// back to index zero and keep going.7272++ExpectedElt;7273if (ExpectedElt == NumElts)7274ExpectedElt = 0;72757276if (M[i] < 0) continue; // ignore UNDEF indices7277if (ExpectedElt != static_cast<unsigned>(M[i]))7278return false;7279}72807281return true;7282}72837284static bool isVEXTMask(ArrayRef<int> M, EVT VT,7285bool &ReverseVEXT, unsigned &Imm) {7286unsigned NumElts = VT.getVectorNumElements();7287ReverseVEXT = false;72887289// Assume that the first shuffle index is not UNDEF. Fail if it is.7290if (M[0] < 0)7291return false;72927293Imm = M[0];72947295// If this is a VEXT shuffle, the immediate value is the index of the first7296// element. 
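  // (For example, for a v8i8 shuffle the mask <5,6,7,8,9,10,11,12> walks
  // forward from element 5 of the concatenated sources, so it is a VEXT with
  // Imm = 5; if the walk wraps past twice the element count, the sources just
  // need to be swapped, which is what ReverseVEXT records.)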
The other shuffle indices must be the successive elements after7297// the first one.7298unsigned ExpectedElt = Imm;7299for (unsigned i = 1; i < NumElts; ++i) {7300// Increment the expected index. If it wraps around, it may still be7301// a VEXT but the source vectors must be swapped.7302ExpectedElt += 1;7303if (ExpectedElt == NumElts * 2) {7304ExpectedElt = 0;7305ReverseVEXT = true;7306}73077308if (M[i] < 0) continue; // ignore UNDEF indices7309if (ExpectedElt != static_cast<unsigned>(M[i]))7310return false;7311}73127313// Adjust the index value if the source operands will be swapped.7314if (ReverseVEXT)7315Imm -= NumElts;73167317return true;7318}73197320static bool isVTBLMask(ArrayRef<int> M, EVT VT) {7321// We can handle <8 x i8> vector shuffles. If the index in the mask is out of7322// range, then 0 is placed into the resulting vector. So pretty much any mask7323// of 8 elements can work here.7324return VT == MVT::v8i8 && M.size() == 8;7325}73267327static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,7328unsigned Index) {7329if (Mask.size() == Elements * 2)7330return Index / Elements;7331return Mask[Index] == 0 ? 0 : 1;7332}73337334// Checks whether the shuffle mask represents a vector transpose (VTRN) by7335// checking that pairs of elements in the shuffle mask represent the same index7336// in each vector, incrementing the expected index by 2 at each step.7337// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]7338// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}7339// v2={e,f,g,h}7340// WhichResult gives the offset for each element in the mask based on which7341// of the two results it belongs to.7342//7343// The transpose can be represented either as:7344// result1 = shufflevector v1, v2, result1_shuffle_mask7345// result2 = shufflevector v1, v2, result2_shuffle_mask7346// where v1/v2 and the shuffle masks have the same number of elements7347// (here WhichResult (see below) indicates which result is being checked)7348//7349// or as:7350// results = shufflevector v1, v2, shuffle_mask7351// where both results are returned in one vector and the shuffle mask has twice7352// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we7353// want to check the low half and high half of the shuffle mask as if it were7354// the other case7355static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {7356unsigned EltSz = VT.getScalarSizeInBits();7357if (EltSz == 64)7358return false;73597360unsigned NumElts = VT.getVectorNumElements();7361if (M.size() != NumElts && M.size() != NumElts*2)7362return false;73637364// If the mask is twice as long as the input vector then we need to check the7365// upper and lower parts of the mask with a matching value for WhichResult7366// FIXME: A mask with only even values will be rejected in case the first7367// element is undefined, e.g. 
[-1, 4, 2, 6] will be rejected, because only7368// M[0] is used to determine WhichResult7369for (unsigned i = 0; i < M.size(); i += NumElts) {7370WhichResult = SelectPairHalf(NumElts, M, i);7371for (unsigned j = 0; j < NumElts; j += 2) {7372if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||7373(M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))7374return false;7375}7376}73777378if (M.size() == NumElts*2)7379WhichResult = 0;73807381return true;7382}73837384/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of7385/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".7386/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.7387static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){7388unsigned EltSz = VT.getScalarSizeInBits();7389if (EltSz == 64)7390return false;73917392unsigned NumElts = VT.getVectorNumElements();7393if (M.size() != NumElts && M.size() != NumElts*2)7394return false;73957396for (unsigned i = 0; i < M.size(); i += NumElts) {7397WhichResult = SelectPairHalf(NumElts, M, i);7398for (unsigned j = 0; j < NumElts; j += 2) {7399if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||7400(M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))7401return false;7402}7403}74047405if (M.size() == NumElts*2)7406WhichResult = 0;74077408return true;7409}74107411// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking7412// that the mask elements are either all even and in steps of size 2 or all odd7413// and in steps of size 2.7414// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]7415// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}7416// v2={e,f,g,h}7417// Requires similar checks to that of isVTRNMask with7418// respect the how results are returned.7419static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {7420unsigned EltSz = VT.getScalarSizeInBits();7421if (EltSz == 64)7422return false;74237424unsigned NumElts = VT.getVectorNumElements();7425if (M.size() != NumElts && M.size() != NumElts*2)7426return false;74277428for (unsigned i = 0; i < M.size(); i += NumElts) {7429WhichResult = SelectPairHalf(NumElts, M, i);7430for (unsigned j = 0; j < NumElts; ++j) {7431if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)7432return false;7433}7434}74357436if (M.size() == NumElts*2)7437WhichResult = 0;74387439// VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.7440if (VT.is64BitVector() && EltSz == 32)7441return false;74427443return true;7444}74457446/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of7447/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".7448/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,7449static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){7450unsigned EltSz = VT.getScalarSizeInBits();7451if (EltSz == 64)7452return false;74537454unsigned NumElts = VT.getVectorNumElements();7455if (M.size() != NumElts && M.size() != NumElts*2)7456return false;74577458unsigned Half = NumElts / 2;7459for (unsigned i = 0; i < M.size(); i += NumElts) {7460WhichResult = SelectPairHalf(NumElts, M, i);7461for (unsigned j = 0; j < NumElts; j += Half) {7462unsigned Idx = WhichResult;7463for (unsigned k = 0; k < Half; ++k) {7464int MIdx = M[i + j + k];7465if (MIdx >= 0 && (unsigned) MIdx != Idx)7466return false;7467Idx += 2;7468}7469}7470}74717472if (M.size() == NumElts*2)7473WhichResult = 0;74747475// VUZP.32 for 64-bit vectors is a pseudo-instruction 

/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  unsigned Half = NumElts / 2;
  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    for (unsigned j = 0; j < NumElts; j += Half) {
      unsigned Idx = WhichResult;
      for (unsigned k = 0; k < Half; ++k) {
        int MIdx = M[i + j + k];
        if (MIdx >= 0 && (unsigned) MIdx != Idx)
          return false;
        Idx += 2;
      }
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
// that pairs of elements of the shufflemask represent the same index in each
// vector, incrementing sequentially through the vectors.
// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
//  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
//  v2={e,f,g,h}
// Requires similar checks to those of isVTRNMask with respect to how results
// are returned.
static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  if (M.size() != NumElts && M.size() != NumElts*2)
    return false;

  for (unsigned i = 0; i < M.size(); i += NumElts) {
    WhichResult = SelectPairHalf(NumElts, M, i);
    unsigned Idx = WhichResult * NumElts / 2;
    for (unsigned j = 0; j < NumElts; j += 2) {
      if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
          (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
        return false;
      Idx += 1;
    }
  }

  if (M.size() == NumElts*2)
    WhichResult = 0;

  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
  if (VT.is64BitVector() && EltSz == 32)
    return false;

  return true;
}

/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                           unsigned &WhichResult,
                                           bool &isV_UNDEF) {
  isV_UNDEF = false;
  if (isVTRNMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIPMask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  isV_UNDEF = true;
  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VTRN;
  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VUZP;
  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
    return ARMISD::VZIP;

  return 0;
}
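
// Illustrative sketch (assumed usage): for a v8i16 shuffle, the classifier
// above maps a mask directly to the matching NEON two-result opcode.
//   unsigned WhichResult; bool isV_UNDEF;
//   isNEONTwoResultShuffleMask({0, 8, 1, 9, 2, 10, 3, 11}, MVT::v8i16,
//                              WhichResult, isV_UNDEF);
//     // -> ARMISD::VZIP, WhichResult == 0, isV_UNDEF == false
//   isNEONTwoResultShuffleMask({0, 0, 2, 2, 4, 4, 6, 6}, MVT::v8i16,
//                              WhichResult, isV_UNDEF);
//     // -> ARMISD::VTRN, WhichResult == 0, isV_UNDEF == true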

/// \return true if this is a reverse operation on a vector.
static bool isReverseMask(ArrayRef<int> M, EVT VT) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size())
    return false;

  // Look for <15, ..., 3, -1, 1, 0>.
  for (unsigned i = 0; i != NumElts; ++i)
    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
      return false;

  return true;
}

static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
    return false;

  // Half-width truncation patterns (e.g. v4i32 -> v8i16):
  // !Top &&  SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
  // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
  //  Top &&  SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
  //  Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
  int Ofs = Top ? 1 : 0;
  int Upper = SingleSource ? 0 : NumElts;
  for (int i = 0, e = NumElts / 2; i != e; ++i) {
    if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
      return false;
    if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
      return false;
  }
  return true;
}

static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
  unsigned NumElts = VT.getVectorNumElements();
  // Make sure the mask has the right size.
  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
    return false;

  // If Top
  //   Look for <0, N, 2, N+2, 4, N+4, ..>.
  //   This inserts Input2 into Input1
  // else if not Top
  //   Look for <0, N+1, 2, N+3, 4, N+5, ..>
  //   This inserts Input1 into Input2
  unsigned Offset = Top ? 0 : 1;
  unsigned N = SingleSource ? 0 : NumElts;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)i)
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
      return false;
  }

  return true;
}

static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
  unsigned NumElts = ToVT.getVectorNumElements();
  if (NumElts != M.size())
    return false;

  // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
  // looking for patterns of:
  //  !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
  //   rev: N/2 0 N/2+1 1 N/2+2 2 ...

  unsigned Off0 = rev ? NumElts / 2 : 0;
  unsigned Off1 = rev ? 0 : NumElts / 2;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
      return false;
  }

  return true;
}
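
// Illustrative sketch (assumed usage) of the VMOVN patterns above, with
// VT = MVT::v8i16 so that N == 8:
//   isVMOVNMask({0, 8, 2, 10, 4, 12, 6, 14}, MVT::v8i16,
//               /*Top=*/true, /*SingleSource=*/false);
//     // true (matches the Top, two-source form)
//   isVMOVNMask({0, 9, 2, 11, 4, 13, 6, 15}, MVT::v8i16,
//               /*Top=*/false, /*SingleSource=*/false);
//     // true (matches the !Top, two-source form)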

// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
// from a pair of inputs. For example:
//  BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
//              FP_ROUND(EXTRACT_ELT(Y, 0),
//              FP_ROUND(EXTRACT_ELT(X, 1),
//              FP_ROUND(EXTRACT_ELT(Y, 1), ...)
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
                                         const ARMSubtarget *ST) {
  assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  if (!ST->hasMVEFloatOps())
    return SDValue();

  SDLoc dl(BV);
  EVT VT = BV.getValueType();
  if (VT != MVT::v8f16)
    return SDValue();

  // We are looking for a buildvector of fptrunc elements, where all the
  // elements are interleavingly extracted from two sources. Check the first
  // two items are valid enough and extract some info from them (they are
  // checked properly in the loop below).
  if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
      BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
    return SDValue();
  if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
      BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
    return SDValue();
  SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
  if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
    return SDValue();

  // Check all the values in the BuildVector line up with our expectations.
  for (unsigned i = 1; i < 4; i++) {
    auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
      return Trunc.getOpcode() == ISD::FP_ROUND &&
             Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
             Trunc.getOperand(0).getOperand(0) == Op &&
             Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
    };
    if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
      return SDValue();
    if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
      return SDValue();
  }

  SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
                           DAG.getConstant(0, dl, MVT::i32));
  return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
                     DAG.getConstant(1, dl, MVT::i32));
}
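
// Illustrative summary of the pattern handled above (editorial sketch, based
// on the code it follows): a v8f16 BUILD_VECTOR whose lanes are fptruncs of
// lanes 0..3 of two v4f32 values X and Y, interleaved as
//   <fptrunc X[0], fptrunc Y[0], fptrunc X[1], fptrunc Y[1], ...>
// is rebuilt as two ARMISD::VCVTN nodes: the first narrows X into one set of
// half-width lanes of an undef v8f16 (lane selector 0), and the second narrows
// Y into the remaining lanes (lane selector 1).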

// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
// from a single input on alternating lanes. For example:
//  BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
//              FP_EXTEND(EXTRACT_ELT(X, 2),
//              FP_EXTEND(EXTRACT_ELT(X, 4), ...)
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
                                       const ARMSubtarget *ST) {
  assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  if (!ST->hasMVEFloatOps())
    return SDValue();

  SDLoc dl(BV);
  EVT VT = BV.getValueType();
  if (VT != MVT::v4f32)
    return SDValue();

  // We are looking for a buildvector of fpext elements, where all the
  // elements are alternating lanes from a single source. For example <0,2,4,6>
  // or <1,3,5,7>. Check the first two items are valid enough and extract some
  // info from them (they are checked properly in the loop below).
  if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
      BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
  int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
  if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
    return SDValue();

  // Check all the values in the BuildVector line up with our expectations.
  for (unsigned i = 1; i < 4; i++) {
    auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
      return Trunc.getOpcode() == ISD::FP_EXTEND &&
             Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
             Trunc.getOperand(0).getOperand(0) == Op &&
             Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
    };
    if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
      return SDValue();
  }

  return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
                     DAG.getConstant(Offset, dl, MVT::i32));
}

// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST, const SDLoc &dl) {
  uint64_t Val;
  if (!isa<ConstantSDNode>(N))
    return SDValue();
  Val = N->getAsZExtVal();

  if (ST->isThumb1Only()) {
    if (Val <= 255 || ~Val <= 255)
      return DAG.getConstant(Val, dl, MVT::i32);
  } else {
    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
      return DAG.getConstant(Val, dl, MVT::i32);
  }
  return SDValue();
}

static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BoolMask;
  unsigned BitsPerBool;
  if (NumElts == 2) {
    BitsPerBool = 8;
    BoolMask = 0xff;
  } else if (NumElts == 4) {
    BitsPerBool = 4;
    BoolMask = 0xf;
  } else if (NumElts == 8) {
    BitsPerBool = 2;
    BoolMask = 0x3;
  } else if (NumElts == 16) {
    BitsPerBool = 1;
    BoolMask = 0x1;
  } else
    return SDValue();

  // If this is a single value copied into all lanes (a splat), we can just
  // sign-extend that single value.
  SDValue FirstOp = Op.getOperand(0);
  if (!isa<ConstantSDNode>(FirstOp) &&
      llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
        return U.get().isUndef() || U.get() == FirstOp;
      })) {
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
                              DAG.getValueType(MVT::i1));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
  }

  // First create base with bits set where known
  unsigned Bits32 = 0;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (!isa<ConstantSDNode>(V) && !V.isUndef())
      continue;
    bool BitSet = V.isUndef() ?
false : V->getAsZExtVal();7822if (BitSet)7823Bits32 |= BoolMask << (i * BitsPerBool);7824}78257826// Add in unknown nodes7827SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,7828DAG.getConstant(Bits32, dl, MVT::i32));7829for (unsigned i = 0; i < NumElts; ++i) {7830SDValue V = Op.getOperand(i);7831if (isa<ConstantSDNode>(V) || V.isUndef())7832continue;7833Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,7834DAG.getConstant(i, dl, MVT::i32));7835}78367837return Base;7838}78397840static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,7841const ARMSubtarget *ST) {7842if (!ST->hasMVEIntegerOps())7843return SDValue();78447845// We are looking for a buildvector where each element is Op[0] + i*N7846EVT VT = Op.getValueType();7847SDValue Op0 = Op.getOperand(0);7848unsigned NumElts = VT.getVectorNumElements();78497850// Get the increment value from operand 17851SDValue Op1 = Op.getOperand(1);7852if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||7853!isa<ConstantSDNode>(Op1.getOperand(1)))7854return SDValue();7855unsigned N = Op1.getConstantOperandVal(1);7856if (N != 1 && N != 2 && N != 4 && N != 8)7857return SDValue();78587859// Check that each other operand matches7860for (unsigned I = 2; I < NumElts; I++) {7861SDValue OpI = Op.getOperand(I);7862if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||7863!isa<ConstantSDNode>(OpI.getOperand(1)) ||7864OpI.getConstantOperandVal(1) != I * N)7865return SDValue();7866}78677868SDLoc DL(Op);7869return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,7870DAG.getConstant(N, DL, MVT::i32));7871}78727873// Returns true if the operation N can be treated as qr instruction variant at7874// operand Op.7875static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {7876switch (N->getOpcode()) {7877case ISD::ADD:7878case ISD::MUL:7879case ISD::SADDSAT:7880case ISD::UADDSAT:7881return true;7882case ISD::SUB:7883case ISD::SSUBSAT:7884case ISD::USUBSAT:7885return N->getOperand(1).getNode() == Op;7886case ISD::INTRINSIC_WO_CHAIN:7887switch (N->getConstantOperandVal(0)) {7888case Intrinsic::arm_mve_add_predicated:7889case Intrinsic::arm_mve_mul_predicated:7890case Intrinsic::arm_mve_qadd_predicated:7891case Intrinsic::arm_mve_vhadd:7892case Intrinsic::arm_mve_hadd_predicated:7893case Intrinsic::arm_mve_vqdmulh:7894case Intrinsic::arm_mve_qdmulh_predicated:7895case Intrinsic::arm_mve_vqrdmulh:7896case Intrinsic::arm_mve_qrdmulh_predicated:7897case Intrinsic::arm_mve_vqdmull:7898case Intrinsic::arm_mve_vqdmull_predicated:7899return true;7900case Intrinsic::arm_mve_sub_predicated:7901case Intrinsic::arm_mve_qsub_predicated:7902case Intrinsic::arm_mve_vhsub:7903case Intrinsic::arm_mve_hsub_predicated:7904return N->getOperand(2).getNode() == Op;7905default:7906return false;7907}7908default:7909return false;7910}7911}79127913// If this is a case we can't handle, return null and let the default7914// expansion code take care of it.7915SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,7916const ARMSubtarget *ST) const {7917BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());7918SDLoc dl(Op);7919EVT VT = Op.getValueType();79207921if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)7922return LowerBUILD_VECTOR_i1(Op, DAG, ST);79237924if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))7925return R;79267927APInt SplatBits, SplatUndef;7928unsigned SplatBitSize;7929bool HasAnyUndefs;7930if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) 
{7931if (SplatUndef.isAllOnes())7932return DAG.getUNDEF(VT);79337934// If all the users of this constant splat are qr instruction variants,7935// generate a vdup of the constant.7936if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&7937(SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&7938all_of(BVN->uses(),7939[BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {7940EVT DupVT = SplatBitSize == 32 ? MVT::v4i327941: SplatBitSize == 16 ? MVT::v8i167942: MVT::v16i8;7943SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);7944SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);7945return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);7946}79477948if ((ST->hasNEON() && SplatBitSize <= 64) ||7949(ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {7950// Check if an immediate VMOV works.7951EVT VmovVT;7952SDValue Val =7953isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),7954SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);79557956if (Val.getNode()) {7957SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);7958return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);7959}79607961// Try an immediate VMVN.7962uint64_t NegatedImm = (~SplatBits).getZExtValue();7963Val = isVMOVModifiedImm(7964NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,7965VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);7966if (Val.getNode()) {7967SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);7968return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);7969}79707971// Use vmov.f32 to materialize other v2f32 and v4f32 splats.7972if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {7973int ImmVal = ARM_AM::getFP32Imm(SplatBits);7974if (ImmVal != -1) {7975SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);7976return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);7977}7978}79797980// If we are under MVE, generate a VDUP(constant), bitcast to the original7981// type.7982if (ST->hasMVEIntegerOps() &&7983(SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {7984EVT DupVT = SplatBitSize == 32 ? MVT::v4i327985: SplatBitSize == 16 ? MVT::v8i167986: MVT::v16i8;7987SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);7988SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);7989return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);7990}7991}7992}79937994// Scan through the operands to see if only one value is used.7995//7996// As an optimisation, even if more than one value is used it may be more7997// profitable to splat with one value then change some lanes.7998//7999// Heuristically we decide to do this if the vector has a "dominant" value,8000// defined as splatted to more than half of the lanes.8001unsigned NumElts = VT.getVectorNumElements();8002bool isOnlyLowElement = true;8003bool usesOnlyOneValue = true;8004bool hasDominantValue = false;8005bool isConstant = true;80068007// Map of the number of times a particular SDValue appears in the8008// element list.8009DenseMap<SDValue, unsigned> ValueCounts;8010SDValue Value;8011for (unsigned i = 0; i < NumElts; ++i) {8012SDValue V = Op.getOperand(i);8013if (V.isUndef())8014continue;8015if (i > 0)8016isOnlyLowElement = false;8017if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))8018isConstant = false;80198020ValueCounts.insert(std::make_pair(V, 0));8021unsigned &Count = ValueCounts[V];80228023// Is this value dominant? 
(takes up more than half of the lanes)8024if (++Count > (NumElts / 2)) {8025hasDominantValue = true;8026Value = V;8027}8028}8029if (ValueCounts.size() != 1)8030usesOnlyOneValue = false;8031if (!Value.getNode() && !ValueCounts.empty())8032Value = ValueCounts.begin()->first;80338034if (ValueCounts.empty())8035return DAG.getUNDEF(VT);80368037// Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.8038// Keep going if we are hitting this case.8039if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))8040return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);80418042unsigned EltSize = VT.getScalarSizeInBits();80438044// Use VDUP for non-constant splats. For f32 constant splats, reduce to8045// i32 and try again.8046if (hasDominantValue && EltSize <= 32) {8047if (!isConstant) {8048SDValue N;80498050// If we are VDUPing a value that comes directly from a vector, that will8051// cause an unnecessary move to and from a GPR, where instead we could8052// just use VDUPLANE. We can only do this if the lane being extracted8053// is at a constant index, as the VDUP from lane instructions only have8054// constant-index forms.8055ConstantSDNode *constIndex;8056if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&8057(constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {8058// We need to create a new undef vector to use for the VDUPLANE if the8059// size of the vector from which we get the value is different than the8060// size of the vector that we need to create. We will insert the element8061// such that the register coalescer will remove unnecessary copies.8062if (VT != Value->getOperand(0).getValueType()) {8063unsigned index = constIndex->getAPIntValue().getLimitedValue() %8064VT.getVectorNumElements();8065N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,8066DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),8067Value, DAG.getConstant(index, dl, MVT::i32)),8068DAG.getConstant(index, dl, MVT::i32));8069} else8070N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,8071Value->getOperand(0), Value->getOperand(1));8072} else8073N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);80748075if (!usesOnlyOneValue) {8076// The dominant value was splatted as 'N', but we now have to insert8077// all differing elements.8078for (unsigned I = 0; I < NumElts; ++I) {8079if (Op.getOperand(I) == Value)8080continue;8081SmallVector<SDValue, 3> Ops;8082Ops.push_back(N);8083Ops.push_back(Op.getOperand(I));8084Ops.push_back(DAG.getConstant(I, dl, MVT::i32));8085N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);8086}8087}8088return N;8089}8090if (VT.getVectorElementType().isFloatingPoint()) {8091SmallVector<SDValue, 8> Ops;8092MVT FVT = VT.getVectorElementType().getSimpleVT();8093assert(FVT == MVT::f32 || FVT == MVT::f16);8094MVT IVT = (FVT == MVT::f32) ? 
MVT::i32 : MVT::i16;8095for (unsigned i = 0; i < NumElts; ++i)8096Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,8097Op.getOperand(i)));8098EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);8099SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);8100Val = LowerBUILD_VECTOR(Val, DAG, ST);8101if (Val.getNode())8102return DAG.getNode(ISD::BITCAST, dl, VT, Val);8103}8104if (usesOnlyOneValue) {8105SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);8106if (isConstant && Val.getNode())8107return DAG.getNode(ARMISD::VDUP, dl, VT, Val);8108}8109}81108111// If all elements are constants and the case above didn't get hit, fall back8112// to the default expansion, which will generate a load from the constant8113// pool.8114if (isConstant)8115return SDValue();81168117// Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and8118// vmovn). Empirical tests suggest this is rarely worth it for vectors of8119// length <= 2.8120if (NumElts >= 4)8121if (SDValue shuffle = ReconstructShuffle(Op, DAG))8122return shuffle;81238124// Attempt to turn a buildvector of scalar fptrunc's or fpext's back into8125// VCVT's8126if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))8127return VCVT;8128if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))8129return VCVT;81308131if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {8132// If we haven't found an efficient lowering, try splitting a 128-bit vector8133// into two 64-bit vectors; we might discover a better way to lower it.8134SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);8135EVT ExtVT = VT.getVectorElementType();8136EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);8137SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));8138if (Lower.getOpcode() == ISD::BUILD_VECTOR)8139Lower = LowerBUILD_VECTOR(Lower, DAG, ST);8140SDValue Upper =8141DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));8142if (Upper.getOpcode() == ISD::BUILD_VECTOR)8143Upper = LowerBUILD_VECTOR(Upper, DAG, ST);8144if (Lower && Upper)8145return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);8146}81478148// Vectors with 32- or 64-bit elements can be built by directly assigning8149// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands8150// will be legalized.8151if (EltSize >= 32) {8152// Do the expansion with floating-point types, since that is what the VFP8153// registers are defined to use, and since i64 is not legal.8154EVT EltVT = EVT::getFloatingPointVT(EltSize);8155EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);8156SmallVector<SDValue, 8> Ops;8157for (unsigned i = 0; i < NumElts; ++i)8158Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));8159SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);8160return DAG.getNode(ISD::BITCAST, dl, VT, Val);8161}81628163// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we8164// know the default expansion would otherwise fall back on something even8165// worse. 
For a vector with one or two non-undef values, that's8166// scalar_to_vector for the elements followed by a shuffle (provided the8167// shuffle is valid for the target) and materialization element by element8168// on the stack followed by a load for everything else.8169if (!isConstant && !usesOnlyOneValue) {8170SDValue Vec = DAG.getUNDEF(VT);8171for (unsigned i = 0 ; i < NumElts; ++i) {8172SDValue V = Op.getOperand(i);8173if (V.isUndef())8174continue;8175SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);8176Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);8177}8178return Vec;8179}81808181return SDValue();8182}81838184// Gather data to see if the operation can be modelled as a8185// shuffle in combination with VEXTs.8186SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,8187SelectionDAG &DAG) const {8188assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");8189SDLoc dl(Op);8190EVT VT = Op.getValueType();8191unsigned NumElts = VT.getVectorNumElements();81928193struct ShuffleSourceInfo {8194SDValue Vec;8195unsigned MinElt = std::numeric_limits<unsigned>::max();8196unsigned MaxElt = 0;81978198// We may insert some combination of BITCASTs and VEXT nodes to force Vec to8199// be compatible with the shuffle we intend to construct. As a result8200// ShuffleVec will be some sliding window into the original Vec.8201SDValue ShuffleVec;82028203// Code should guarantee that element i in Vec starts at element "WindowBase8204// + i * WindowScale in ShuffleVec".8205int WindowBase = 0;8206int WindowScale = 1;82078208ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}82098210bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }8211};82128213// First gather all vectors used as an immediate source for this BUILD_VECTOR8214// node.8215SmallVector<ShuffleSourceInfo, 2> Sources;8216for (unsigned i = 0; i < NumElts; ++i) {8217SDValue V = Op.getOperand(i);8218if (V.isUndef())8219continue;8220else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {8221// A shuffle can only come from building a vector from various8222// elements of other vectors.8223return SDValue();8224} else if (!isa<ConstantSDNode>(V.getOperand(1))) {8225// Furthermore, shuffles require a constant mask, whereas extractelts8226// accept variable indices.8227return SDValue();8228}82298230// Add this element source to the list if it's not already there.8231SDValue SourceVec = V.getOperand(0);8232auto Source = llvm::find(Sources, SourceVec);8233if (Source == Sources.end())8234Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));82358236// Update the minimum and maximum lane number seen.8237unsigned EltNo = V.getConstantOperandVal(1);8238Source->MinElt = std::min(Source->MinElt, EltNo);8239Source->MaxElt = std::max(Source->MaxElt, EltNo);8240}82418242// Currently only do something sane when at most two source vectors8243// are involved.8244if (Sources.size() > 2)8245return SDValue();82468247// Find out the smallest element size among result and two sources, and use8248// it as element size to build the shuffle_vector.8249EVT SmallestEltTy = VT.getVectorElementType();8250for (auto &Source : Sources) {8251EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();8252if (SrcEltTy.bitsLT(SmallestEltTy))8253SmallestEltTy = SrcEltTy;8254}8255unsigned ResMultiplier =8256VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();8257NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();8258EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);82598260// If the 
source vector is too wide or too narrow, we may nevertheless be able8261// to construct a compatible shuffle either by concatenating it with UNDEF or8262// extracting a suitable range of elements.8263for (auto &Src : Sources) {8264EVT SrcVT = Src.ShuffleVec.getValueType();82658266uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();8267uint64_t VTSize = VT.getFixedSizeInBits();8268if (SrcVTSize == VTSize)8269continue;82708271// This stage of the search produces a source with the same element type as8272// the original, but with a total width matching the BUILD_VECTOR output.8273EVT EltVT = SrcVT.getVectorElementType();8274unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();8275EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);82768277if (SrcVTSize < VTSize) {8278if (2 * SrcVTSize != VTSize)8279return SDValue();8280// We can pad out the smaller vector for free, so if it's part of a8281// shuffle...8282Src.ShuffleVec =8283DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,8284DAG.getUNDEF(Src.ShuffleVec.getValueType()));8285continue;8286}82878288if (SrcVTSize != 2 * VTSize)8289return SDValue();82908291if (Src.MaxElt - Src.MinElt >= NumSrcElts) {8292// Span too large for a VEXT to cope8293return SDValue();8294}82958296if (Src.MinElt >= NumSrcElts) {8297// The extraction can just take the second half8298Src.ShuffleVec =8299DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,8300DAG.getConstant(NumSrcElts, dl, MVT::i32));8301Src.WindowBase = -NumSrcElts;8302} else if (Src.MaxElt < NumSrcElts) {8303// The extraction can just take the first half8304Src.ShuffleVec =8305DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,8306DAG.getConstant(0, dl, MVT::i32));8307} else {8308// An actual VEXT is needed8309SDValue VEXTSrc1 =8310DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,8311DAG.getConstant(0, dl, MVT::i32));8312SDValue VEXTSrc2 =8313DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,8314DAG.getConstant(NumSrcElts, dl, MVT::i32));83158316Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,8317VEXTSrc2,8318DAG.getConstant(Src.MinElt, dl, MVT::i32));8319Src.WindowBase = -Src.MinElt;8320}8321}83228323// Another possible incompatibility occurs from the vector element types. 
We8324// can fix this by bitcasting the source vectors to the same type we intend8325// for the shuffle.8326for (auto &Src : Sources) {8327EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();8328if (SrcEltTy == SmallestEltTy)8329continue;8330assert(ShuffleVT.getVectorElementType() == SmallestEltTy);8331Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);8332Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();8333Src.WindowBase *= Src.WindowScale;8334}83358336// Final check before we try to actually produce a shuffle.8337LLVM_DEBUG(for (auto Src8338: Sources)8339assert(Src.ShuffleVec.getValueType() == ShuffleVT););83408341// The stars all align, our next step is to produce the mask for the shuffle.8342SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);8343int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();8344for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {8345SDValue Entry = Op.getOperand(i);8346if (Entry.isUndef())8347continue;83488349auto Src = llvm::find(Sources, Entry.getOperand(0));8350int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();83518352// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit8353// trunc. So only std::min(SrcBits, DestBits) actually get defined in this8354// segment.8355EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();8356int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),8357VT.getScalarSizeInBits());8358int LanesDefined = BitsDefined / BitsPerShuffleLane;83598360// This source is expected to fill ResMultiplier lanes of the final shuffle,8361// starting at the appropriate offset.8362int *LaneMask = &Mask[i * ResMultiplier];83638364int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;8365ExtractBase += NumElts * (Src - Sources.begin());8366for (int j = 0; j < LanesDefined; ++j)8367LaneMask[j] = ExtractBase + j;8368}836983708371// We can't handle more than two sources. 
This should have already8372// been checked before this point.8373assert(Sources.size() <= 2 && "Too many sources!");83748375SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };8376for (unsigned i = 0; i < Sources.size(); ++i)8377ShuffleOps[i] = Sources[i].ShuffleVec;83788379SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],8380ShuffleOps[1], Mask, DAG);8381if (!Shuffle)8382return SDValue();8383return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);8384}83858386enum ShuffleOpCodes {8387OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>8388OP_VREV,8389OP_VDUP0,8390OP_VDUP1,8391OP_VDUP2,8392OP_VDUP3,8393OP_VEXT1,8394OP_VEXT2,8395OP_VEXT3,8396OP_VUZPL, // VUZP, left result8397OP_VUZPR, // VUZP, right result8398OP_VZIPL, // VZIP, left result8399OP_VZIPR, // VZIP, right result8400OP_VTRNL, // VTRN, left result8401OP_VTRNR // VTRN, right result8402};84038404static bool isLegalMVEShuffleOp(unsigned PFEntry) {8405unsigned OpNum = (PFEntry >> 26) & 0x0F;8406switch (OpNum) {8407case OP_COPY:8408case OP_VREV:8409case OP_VDUP0:8410case OP_VDUP1:8411case OP_VDUP2:8412case OP_VDUP3:8413return true;8414}8415return false;8416}84178418/// isShuffleMaskLegal - Targets can use this to indicate that they only8419/// support *some* VECTOR_SHUFFLE operations, those with specific masks.8420/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values8421/// are assumed to be legal.8422bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {8423if (VT.getVectorNumElements() == 4 &&8424(VT.is128BitVector() || VT.is64BitVector())) {8425unsigned PFIndexes[4];8426for (unsigned i = 0; i != 4; ++i) {8427if (M[i] < 0)8428PFIndexes[i] = 8;8429else8430PFIndexes[i] = M[i];8431}84328433// Compute the index in the perfect shuffle table.8434unsigned PFTableIndex =8435PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];8436unsigned PFEntry = PerfectShuffleTable[PFTableIndex];8437unsigned Cost = (PFEntry >> 30);84388439if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))8440return true;8441}84428443bool ReverseVEXT, isV_UNDEF;8444unsigned Imm, WhichResult;84458446unsigned EltSize = VT.getScalarSizeInBits();8447if (EltSize >= 32 ||8448ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||8449ShuffleVectorInst::isIdentityMask(M, M.size()) ||8450isVREVMask(M, VT, 64) ||8451isVREVMask(M, VT, 32) ||8452isVREVMask(M, VT, 16))8453return true;8454else if (Subtarget->hasNEON() &&8455(isVEXTMask(M, VT, ReverseVEXT, Imm) ||8456isVTBLMask(M, VT) ||8457isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))8458return true;8459else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&8460isReverseMask(M, VT))8461return true;8462else if (Subtarget->hasMVEIntegerOps() &&8463(isVMOVNMask(M, VT, true, false) ||8464isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))8465return true;8466else if (Subtarget->hasMVEIntegerOps() &&8467(isTruncMask(M, VT, false, false) ||8468isTruncMask(M, VT, false, true) ||8469isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))8470return true;8471else8472return false;8473}84748475/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit8476/// the specified operations to build the shuffle.8477static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,8478SDValue RHS, SelectionDAG &DAG,8479const SDLoc &dl) {8480unsigned OpNum = (PFEntry >> 26) & 0x0F;8481unsigned LHSID = (PFEntry >> 13) & ((1 << 
13)-1);8482unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);84838484if (OpNum == OP_COPY) {8485if (LHSID == (1*9+2)*9+3) return LHS;8486assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");8487return RHS;8488}84898490SDValue OpLHS, OpRHS;8491OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);8492OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);8493EVT VT = OpLHS.getValueType();84948495switch (OpNum) {8496default: llvm_unreachable("Unknown shuffle opcode!");8497case OP_VREV:8498// VREV divides the vector in half and swaps within the half.8499if (VT.getScalarSizeInBits() == 32)8500return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);8501// vrev <4 x i16> -> VREV328502if (VT.getScalarSizeInBits() == 16)8503return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);8504// vrev <4 x i8> -> VREV168505assert(VT.getScalarSizeInBits() == 8);8506return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);8507case OP_VDUP0:8508case OP_VDUP1:8509case OP_VDUP2:8510case OP_VDUP3:8511return DAG.getNode(ARMISD::VDUPLANE, dl, VT,8512OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));8513case OP_VEXT1:8514case OP_VEXT2:8515case OP_VEXT3:8516return DAG.getNode(ARMISD::VEXT, dl, VT,8517OpLHS, OpRHS,8518DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));8519case OP_VUZPL:8520case OP_VUZPR:8521return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),8522OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);8523case OP_VZIPL:8524case OP_VZIPR:8525return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),8526OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);8527case OP_VTRNL:8528case OP_VTRNR:8529return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),8530OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);8531}8532}85338534static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,8535ArrayRef<int> ShuffleMask,8536SelectionDAG &DAG) {8537// Check to see if we can use the VTBL instruction.8538SDValue V1 = Op.getOperand(0);8539SDValue V2 = Op.getOperand(1);8540SDLoc DL(Op);85418542SmallVector<SDValue, 8> VTBLMask;8543for (int I : ShuffleMask)8544VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32));85458546if (V2.getNode()->isUndef())8547return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,8548DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));85498550return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,8551DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));8552}85538554static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {8555SDLoc DL(Op);8556EVT VT = Op.getValueType();85578558assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&8559"Expect an v8i16/v16i8 type");8560SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));8561// For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. 
Now,8562// extract the first 8 bytes into the top double word and the last 8 bytes8563// into the bottom double word, through a new vector shuffle that will be8564// turned into a VEXT on Neon, or a couple of VMOVDs on MVE.8565std::vector<int> NewMask;8566for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)8567NewMask.push_back(VT.getVectorNumElements() / 2 + i);8568for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)8569NewMask.push_back(i);8570return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);8571}85728573static EVT getVectorTyFromPredicateVector(EVT VT) {8574switch (VT.getSimpleVT().SimpleTy) {8575case MVT::v2i1:8576return MVT::v2f64;8577case MVT::v4i1:8578return MVT::v4i32;8579case MVT::v8i1:8580return MVT::v8i16;8581case MVT::v16i1:8582return MVT::v16i8;8583default:8584llvm_unreachable("Unexpected vector predicate type");8585}8586}85878588static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,8589SelectionDAG &DAG) {8590// Converting from boolean predicates to integers involves creating a vector8591// of all ones or all zeroes and selecting the lanes based upon the real8592// predicate.8593SDValue AllOnes =8594DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);8595AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);85968597SDValue AllZeroes =8598DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);8599AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);86008601// Get full vector type from predicate type8602EVT NewVT = getVectorTyFromPredicateVector(VT);86038604SDValue RecastV1;8605// If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast8606// this to a v16i1. This cannot be done with an ordinary bitcast because the8607// sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,8608// since we know in hardware the sizes are really the same.8609if (VT != MVT::v16i1)8610RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);8611else8612RecastV1 = Pred;86138614// Select either all ones or zeroes depending upon the real predicate bits.8615SDValue PredAsVector =8616DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);86178618// Recast our new predicate-as-integer v16i8 vector into something8619// appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.8620return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);8621}86228623static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,8624const ARMSubtarget *ST) {8625EVT VT = Op.getValueType();8626ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());8627ArrayRef<int> ShuffleMask = SVN->getMask();86288629assert(ST->hasMVEIntegerOps() &&8630"No support for vector shuffle of boolean predicates");86318632SDValue V1 = Op.getOperand(0);8633SDValue V2 = Op.getOperand(1);8634SDLoc dl(Op);8635if (isReverseMask(ShuffleMask, VT)) {8636SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);8637SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);8638SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,8639DAG.getConstant(16, dl, MVT::i32));8640return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);8641}86428643// Until we can come up with optimised cases for every single vector8644// shuffle in existence we have chosen the least painful strategy. This is8645// to essentially promote the boolean predicate to a 8-bit integer, where8646// each predicate represents a byte. 
Then we fall back on a normal integer8647// vector shuffle and convert the result back into a predicate vector. In8648// many cases the generated code might be even better than scalar code8649// operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit8650// fields in a register into 8 other arbitrary 2-bit fields!8651SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);8652EVT NewVT = PredAsVector1.getValueType();8653SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)8654: PromoteMVEPredVector(dl, V2, VT, DAG);8655assert(PredAsVector2.getValueType() == NewVT &&8656"Expected identical vector type in expanded i1 shuffle!");86578658// Do the shuffle!8659SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,8660PredAsVector2, ShuffleMask);86618662// Now return the result of comparing the shuffled vector with zero,8663// which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i18664// we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.8665if (VT == MVT::v2i1) {8666SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);8667SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,8668DAG.getConstant(ARMCC::NE, dl, MVT::i32));8669return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);8670}8671return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,8672DAG.getConstant(ARMCC::NE, dl, MVT::i32));8673}86748675static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,8676ArrayRef<int> ShuffleMask,8677SelectionDAG &DAG) {8678// Attempt to lower the vector shuffle using as many whole register movs as8679// possible. This is useful for types smaller than 32bits, which would8680// often otherwise become a series for grp movs.8681SDLoc dl(Op);8682EVT VT = Op.getValueType();8683if (VT.getScalarSizeInBits() >= 32)8684return SDValue();86858686assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&8687"Unexpected vector type");8688int NumElts = VT.getVectorNumElements();8689int QuarterSize = NumElts / 4;8690// The four final parts of the vector, as i32's8691SDValue Parts[4];86928693// Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not8694// <u,u,u,u>), returning the vmov lane index8695auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {8696// Detect which mov lane this would be from the first non-undef element.8697int MovIdx = -1;8698for (int i = 0; i < Length; i++) {8699if (ShuffleMask[Start + i] >= 0) {8700if (ShuffleMask[Start + i] % Length != i)8701return -1;8702MovIdx = ShuffleMask[Start + i] / Length;8703break;8704}8705}8706// If all items are undef, leave this for other combines8707if (MovIdx == -1)8708return -1;8709// Check the remaining values are the correct part of the same mov8710for (int i = 1; i < Length; i++) {8711if (ShuffleMask[Start + i] >= 0 &&8712(ShuffleMask[Start + i] / Length != MovIdx ||8713ShuffleMask[Start + i] % Length != i))8714return -1;8715}8716return MovIdx;8717};87188719for (int Part = 0; Part < 4; ++Part) {8720// Does this part look like a mov8721int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);8722if (Elt != -1) {8723SDValue Input = Op->getOperand(0);8724if (Elt >= 4) {8725Input = Op->getOperand(1);8726Elt -= 4;8727}8728SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);8729Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,8730DAG.getConstant(Elt, dl, MVT::i32));8731}8732}87338734// Nothing interesting found, just return8735if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])8736return 
SDValue();87378738// The other parts need to be built with the old shuffle vector, cast to a8739// v4i32 and extract_vector_elts8740if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {8741SmallVector<int, 16> NewShuffleMask;8742for (int Part = 0; Part < 4; ++Part)8743for (int i = 0; i < QuarterSize; i++)8744NewShuffleMask.push_back(8745Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);8746SDValue NewShuffle = DAG.getVectorShuffle(8747VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);8748SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);87498750for (int Part = 0; Part < 4; ++Part)8751if (!Parts[Part])8752Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,8753BitCast, DAG.getConstant(Part, dl, MVT::i32));8754}8755// Build a vector out of the various parts and bitcast it back to the original8756// type.8757SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);8758return DAG.getBitcast(VT, NewVec);8759}87608761static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,8762ArrayRef<int> ShuffleMask,8763SelectionDAG &DAG) {8764SDValue V1 = Op.getOperand(0);8765SDValue V2 = Op.getOperand(1);8766EVT VT = Op.getValueType();8767unsigned NumElts = VT.getVectorNumElements();87688769// An One-Off Identity mask is one that is mostly an identity mask from as8770// single source but contains a single element out-of-place, either from a8771// different vector or from another position in the same vector. As opposed to8772// lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert8773// pair directly.8774auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,8775int &OffElement) {8776OffElement = -1;8777int NonUndef = 0;8778for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {8779if (Mask[i] == -1)8780continue;8781NonUndef++;8782if (Mask[i] != i + BaseOffset) {8783if (OffElement == -1)8784OffElement = i;8785else8786return false;8787}8788}8789return NonUndef > 2 && OffElement != -1;8790};8791int OffElement;8792SDValue VInput;8793if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))8794VInput = V1;8795else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))8796VInput = V2;8797else8798return SDValue();87998800SDLoc dl(Op);8801EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i168802? MVT::i328803: VT.getScalarType();8804SDValue Elt = DAG.getNode(8805ISD::EXTRACT_VECTOR_ELT, dl, SVT,8806ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,8807DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));8808return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,8809DAG.getVectorIdxConstant(OffElement % NumElts, dl));8810}88118812static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,8813const ARMSubtarget *ST) {8814SDValue V1 = Op.getOperand(0);8815SDValue V2 = Op.getOperand(1);8816SDLoc dl(Op);8817EVT VT = Op.getValueType();8818ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());8819unsigned EltSize = VT.getScalarSizeInBits();88208821if (ST->hasMVEIntegerOps() && EltSize == 1)8822return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);88238824// Convert shuffles that are directly supported on NEON to target-specific8825// DAG nodes, instead of keeping them as shuffles and matching them again8826// during code selection. 
This is more efficient and avoids the possibility8827// of inconsistencies between legalization and selection.8828// FIXME: floating-point vectors should be canonicalized to integer vectors8829// of the same time so that they get CSEd properly.8830ArrayRef<int> ShuffleMask = SVN->getMask();88318832if (EltSize <= 32) {8833if (SVN->isSplat()) {8834int Lane = SVN->getSplatIndex();8835// If this is undef splat, generate it via "just" vdup, if possible.8836if (Lane == -1) Lane = 0;88378838// Test if V1 is a SCALAR_TO_VECTOR.8839if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {8840return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));8841}8842// Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR8843// (and probably will turn into a SCALAR_TO_VECTOR once legalization8844// reaches it).8845if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&8846!isa<ConstantSDNode>(V1.getOperand(0))) {8847bool IsScalarToVector = true;8848for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)8849if (!V1.getOperand(i).isUndef()) {8850IsScalarToVector = false;8851break;8852}8853if (IsScalarToVector)8854return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));8855}8856return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,8857DAG.getConstant(Lane, dl, MVT::i32));8858}88598860bool ReverseVEXT = false;8861unsigned Imm = 0;8862if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {8863if (ReverseVEXT)8864std::swap(V1, V2);8865return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,8866DAG.getConstant(Imm, dl, MVT::i32));8867}88688869if (isVREVMask(ShuffleMask, VT, 64))8870return DAG.getNode(ARMISD::VREV64, dl, VT, V1);8871if (isVREVMask(ShuffleMask, VT, 32))8872return DAG.getNode(ARMISD::VREV32, dl, VT, V1);8873if (isVREVMask(ShuffleMask, VT, 16))8874return DAG.getNode(ARMISD::VREV16, dl, VT, V1);88758876if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {8877return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,8878DAG.getConstant(Imm, dl, MVT::i32));8879}88808881// Check for Neon shuffles that modify both input vectors in place.8882// If both results are used, i.e., if there are two shuffles with the same8883// source operands and with masks corresponding to both results of one of8884// these operations, DAG memoization will ensure that a single node is8885// used for both shuffles.8886unsigned WhichResult = 0;8887bool isV_UNDEF = false;8888if (ST->hasNEON()) {8889if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(8890ShuffleMask, VT, WhichResult, isV_UNDEF)) {8891if (isV_UNDEF)8892V2 = V1;8893return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)8894.getValue(WhichResult);8895}8896}8897if (ST->hasMVEIntegerOps()) {8898if (isVMOVNMask(ShuffleMask, VT, false, false))8899return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,8900DAG.getConstant(0, dl, MVT::i32));8901if (isVMOVNMask(ShuffleMask, VT, true, false))8902return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,8903DAG.getConstant(1, dl, MVT::i32));8904if (isVMOVNMask(ShuffleMask, VT, true, true))8905return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,8906DAG.getConstant(1, dl, MVT::i32));8907}89088909// Also check for these shuffles through CONCAT_VECTORS: we canonicalize8910// shuffles that produce a result larger than their operands with:8911// shuffle(concat(v1, undef), concat(v2, undef))8912// ->8913// shuffle(concat(v1, v2), undef)8914// because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).8915//8916// This is useful in the general case, but there are special cases where8917// 
native shuffles produce larger results: the two-result ops.8918//8919// Look through the concat when lowering them:8920// shuffle(concat(v1, v2), undef)8921// ->8922// concat(VZIP(v1, v2):0, :1)8923//8924if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {8925SDValue SubV1 = V1->getOperand(0);8926SDValue SubV2 = V1->getOperand(1);8927EVT SubVT = SubV1.getValueType();89288929// We expect these to have been canonicalized to -1.8930assert(llvm::all_of(ShuffleMask, [&](int i) {8931return i < (int)VT.getVectorNumElements();8932}) && "Unexpected shuffle index into UNDEF operand!");89338934if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(8935ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {8936if (isV_UNDEF)8937SubV2 = SubV1;8938assert((WhichResult == 0) &&8939"In-place shuffle of concat can only have one result!");8940SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),8941SubV1, SubV2);8942return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),8943Res.getValue(1));8944}8945}8946}89478948if (ST->hasMVEIntegerOps() && EltSize <= 32) {8949if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))8950return V;89518952for (bool Top : {false, true}) {8953for (bool SingleSource : {false, true}) {8954if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {8955MVT FromSVT = MVT::getIntegerVT(EltSize * 2);8956MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);8957SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);8958SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,8959SingleSource ? V1 : V2);8960if (Top) {8961SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);8962Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);8963Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);8964}8965return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);8966}8967}8968}8969}89708971// If the shuffle is not directly supported and it has 4 elements, use8972// the PerfectShuffle-generated table to synthesize it from other shuffles.8973unsigned NumElts = VT.getVectorNumElements();8974if (NumElts == 4) {8975unsigned PFIndexes[4];8976for (unsigned i = 0; i != 4; ++i) {8977if (ShuffleMask[i] < 0)8978PFIndexes[i] = 8;8979else8980PFIndexes[i] = ShuffleMask[i];8981}89828983// Compute the index in the perfect shuffle table.8984unsigned PFTableIndex =8985PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];8986unsigned PFEntry = PerfectShuffleTable[PFTableIndex];8987unsigned Cost = (PFEntry >> 30);89888989if (Cost <= 4) {8990if (ST->hasNEON())8991return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);8992else if (isLegalMVEShuffleOp(PFEntry)) {8993unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);8994unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);8995unsigned PFEntryLHS = PerfectShuffleTable[LHSID];8996unsigned PFEntryRHS = PerfectShuffleTable[RHSID];8997if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))8998return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);8999}9000}9001}90029003// Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.9004if (EltSize >= 32) {9005// Do the expansion with floating-point types, since that is what the VFP9006// registers are defined to use, and since i64 is not legal.9007EVT EltVT = EVT::getFloatingPointVT(EltSize);9008EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);9009V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);9010V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);9011SmallVector<SDValue, 8> Ops;9012for (unsigned i = 0; i < NumElts; ++i) {9013if 
(ShuffleMask[i] < 0)9014Ops.push_back(DAG.getUNDEF(EltVT));9015else9016Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,9017ShuffleMask[i] < (int)NumElts ? V1 : V2,9018DAG.getConstant(ShuffleMask[i] & (NumElts-1),9019dl, MVT::i32)));9020}9021SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);9022return DAG.getNode(ISD::BITCAST, dl, VT, Val);9023}90249025if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&9026isReverseMask(ShuffleMask, VT))9027return LowerReverse_VECTOR_SHUFFLE(Op, DAG);90289029if (ST->hasNEON() && VT == MVT::v8i8)9030if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))9031return NewOp;90329033if (ST->hasMVEIntegerOps())9034if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))9035return NewOp;90369037return SDValue();9038}90399040static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,9041const ARMSubtarget *ST) {9042EVT VecVT = Op.getOperand(0).getValueType();9043SDLoc dl(Op);90449045assert(ST->hasMVEIntegerOps() &&9046"LowerINSERT_VECTOR_ELT_i1 called without MVE!");90479048SDValue Conv =9049DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));9050unsigned Lane = Op.getConstantOperandVal(2);9051unsigned LaneWidth =9052getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;9053unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;9054SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,9055Op.getOperand(1), DAG.getValueType(MVT::i1));9056SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,9057DAG.getConstant(~Mask, dl, MVT::i32));9058return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);9059}90609061SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,9062SelectionDAG &DAG) const {9063// INSERT_VECTOR_ELT is legal only for immediate indexes.9064SDValue Lane = Op.getOperand(2);9065if (!isa<ConstantSDNode>(Lane))9066return SDValue();90679068SDValue Elt = Op.getOperand(1);9069EVT EltVT = Elt.getValueType();90709071if (Subtarget->hasMVEIntegerOps() &&9072Op.getValueType().getScalarSizeInBits() == 1)9073return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);90749075if (getTypeAction(*DAG.getContext(), EltVT) ==9076TargetLowering::TypeSoftPromoteHalf) {9077// INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,9078// but the type system will try to do that if we don't intervene.9079// Reinterpret any such vector-element insertion as one with the9080// corresponding integer types.90819082SDLoc dl(Op);90839084EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());9085assert(getTypeAction(*DAG.getContext(), IEltVT) !=9086TargetLowering::TypeSoftPromoteHalf);90879088SDValue VecIn = Op.getOperand(0);9089EVT VecVT = VecIn.getValueType();9090EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,9091VecVT.getVectorNumElements());90929093SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);9094SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);9095SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,9096IVecIn, IElt, Lane);9097return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);9098}90999100return Op;9101}91029103static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,9104const ARMSubtarget *ST) {9105EVT VecVT = Op.getOperand(0).getValueType();9106SDLoc dl(Op);91079108assert(ST->hasMVEIntegerOps() &&9109"LowerINSERT_VECTOR_ELT_i1 called without MVE!");91109111SDValue Conv =9112DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));9113unsigned Lane = 
Op.getConstantOperandVal(1);9114unsigned LaneWidth =9115getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;9116SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,9117DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));9118return Shift;9119}91209121static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,9122const ARMSubtarget *ST) {9123// EXTRACT_VECTOR_ELT is legal only for immediate indexes.9124SDValue Lane = Op.getOperand(1);9125if (!isa<ConstantSDNode>(Lane))9126return SDValue();91279128SDValue Vec = Op.getOperand(0);9129EVT VT = Vec.getValueType();91309131if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)9132return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);91339134if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {9135SDLoc dl(Op);9136return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);9137}91389139return Op;9140}91419142static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,9143const ARMSubtarget *ST) {9144SDLoc dl(Op);9145assert(Op.getValueType().getScalarSizeInBits() == 1 &&9146"Unexpected custom CONCAT_VECTORS lowering");9147assert(isPowerOf2_32(Op.getNumOperands()) &&9148"Unexpected custom CONCAT_VECTORS lowering");9149assert(ST->hasMVEIntegerOps() &&9150"CONCAT_VECTORS lowering only supported for MVE");91519152auto ConcatPair = [&](SDValue V1, SDValue V2) {9153EVT Op1VT = V1.getValueType();9154EVT Op2VT = V2.getValueType();9155assert(Op1VT == Op2VT && "Operand types don't match!");9156assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&9157"Unexpected i1 concat operations!");9158EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());91599160SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);9161SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);91629163// We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets9164// promoted to v8i16, etc.9165MVT ElType =9166getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();9167unsigned NumElts = 2 * Op1VT.getVectorNumElements();91689169EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);9170if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {9171// Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller9172// ConcatVT.9173SDValue ConVec =9174DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);9175return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,9176DAG.getConstant(ARMCC::NE, dl, MVT::i32));9177}91789179// Extract the vector elements from Op1 and Op2 one by one and truncate them9180// to be the right size for the destination. For example, if Op1 is v4i19181// then the promoted vector is v4i32. The result of concatenation gives a9182// v8i1, which when promoted is v8i16. 
That means each i32 element from Op19183// needs truncating to i16 and inserting in the result.9184auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {9185EVT NewVT = NewV.getValueType();9186EVT ConcatVT = ConVec.getValueType();9187unsigned ExtScale = 1;9188if (NewVT == MVT::v2f64) {9189NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);9190ExtScale = 2;9191}9192for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {9193SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,9194DAG.getIntPtrConstant(i * ExtScale, dl));9195ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,9196DAG.getConstant(j, dl, MVT::i32));9197}9198return ConVec;9199};9200unsigned j = 0;9201SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);9202ConVec = ExtractInto(NewV1, ConVec, j);9203ConVec = ExtractInto(NewV2, ConVec, j);92049205// Now return the result of comparing the subvector with zero, which will9206// generate a real predicate, i.e. v4i1, v8i1 or v16i1.9207return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,9208DAG.getConstant(ARMCC::NE, dl, MVT::i32));9209};92109211// Concat each pair of subvectors and pack into the lower half of the array.9212SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());9213while (ConcatOps.size() > 1) {9214for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {9215SDValue V1 = ConcatOps[I];9216SDValue V2 = ConcatOps[I + 1];9217ConcatOps[I / 2] = ConcatPair(V1, V2);9218}9219ConcatOps.resize(ConcatOps.size() / 2);9220}9221return ConcatOps[0];9222}92239224static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,9225const ARMSubtarget *ST) {9226EVT VT = Op->getValueType(0);9227if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)9228return LowerCONCAT_VECTORS_i1(Op, DAG, ST);92299230// The only time a CONCAT_VECTORS operation can have legal types is when9231// two 64-bit vectors are concatenated to a 128-bit vector.9232assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&9233"unexpected CONCAT_VECTORS");9234SDLoc dl(Op);9235SDValue Val = DAG.getUNDEF(MVT::v2f64);9236SDValue Op0 = Op.getOperand(0);9237SDValue Op1 = Op.getOperand(1);9238if (!Op0.isUndef())9239Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,9240DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),9241DAG.getIntPtrConstant(0, dl));9242if (!Op1.isUndef())9243Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,9244DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),9245DAG.getIntPtrConstant(1, dl));9246return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);9247}92489249static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,9250const ARMSubtarget *ST) {9251SDValue V1 = Op.getOperand(0);9252SDValue V2 = Op.getOperand(1);9253SDLoc dl(Op);9254EVT VT = Op.getValueType();9255EVT Op1VT = V1.getValueType();9256unsigned NumElts = VT.getVectorNumElements();9257unsigned Index = V2->getAsZExtVal();92589259assert(VT.getScalarSizeInBits() == 1 &&9260"Unexpected custom EXTRACT_SUBVECTOR lowering");9261assert(ST->hasMVEIntegerOps() &&9262"EXTRACT_SUBVECTOR lowering only supported for MVE");92639264SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);92659266// We now have Op1 promoted to a vector of integers, where v8i1 gets9267// promoted to v8i16, etc.92689269MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();92709271if (NumElts == 2) {9272EVT SubVT = MVT::v4i32;9273SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);9274for (unsigned i = Index, 
       j = 0; i < (Index + NumElts); i++, j += 2) {
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
                                DAG.getIntPtrConstant(i, dl));
      SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                           DAG.getConstant(j, dl, MVT::i32));
      SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                           DAG.getConstant(j + 1, dl, MVT::i32));
    }
    SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
                              DAG.getConstant(ARMCC::NE, dl, MVT::i32));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
  }

  EVT SubVT = MVT::getVectorVT(ElType, NumElts);
  SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
  for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
                              DAG.getIntPtrConstant(i, dl));
    SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
                         DAG.getConstant(j, dl, MVT::i32));
  }

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}

// Turn a truncate to a predicate (an i1 vector) into icmp(and(x, 1), 0).
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
                               const ARMSubtarget *ST) {
  assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  EVT VT = N->getValueType(0);
  assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
         "Expected a vector i1 type!");
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  SDLoc DL(N);

  SDValue And =
      DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
                     DAG.getCondCode(ISD::SETNE));
}

static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT ToVT = N->getValueType(0);
  if (ToVT.getScalarType() == MVT::i1)
    return LowerTruncatei1(N, DAG, Subtarget);

  // MVE does not have a single instruction to perform the truncation of a v4i32
  // into the lower half of a v8i16, in the same way that a NEON vmovn would.
  // Most of the instructions in MVE follow the 'Beats' system, where moving
  // values from different lanes is usually something that the instructions
  // avoid.
  //
  // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
  // which take the top/bottom half of a larger lane and extend it (or do the
  // opposite, truncating into the top/bottom lane from a larger lane). Note
  // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
  // bottom 16 bits from each vector lane. This works really well with T/B
  // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
  // to move order.
  //
  // But truncates and sext/zext are always going to be fairly common from llvm.
  // We have several options for how to deal with them:
  // - Wherever possible combine them into an instruction that makes them
  //   "free". This includes loads/stores, which can perform the trunc as part
  //   of the memory operation. Or certain shuffles that can be turned into
  //   VMOVN/VMOVL.
  // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  //   trunc(mul(sext(a), sext(b))) may become
  //   VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
  //   this case can use VMULL). This is performed in the
  //   MVELaneInterleavingPass.
  // - Otherwise we have an option. By default we would expand the
  //   zext/sext/trunc into a series of lane extract/inserts going via GPR
  //   registers. One for each vector lane in the vector. This can obviously be
  //   very expensive.
  // - The other option is to use the fact that loads/stores can extend/truncate
  //   to turn a trunc into two truncating stack stores and a stack reload. This
  //   becomes 3 back-to-back memory operations, but at least that is less than
  //   all the insert/extracts.
  //
  // In order to do the last, we convert certain trunc's into MVETRUNC, which
  // are either optimized where they can be, or eventually lowered into stack
  // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
  // too early, where other instructions would be better, and stops us from
  // having to reconstruct multiple buildvector shuffles into loads/stores.
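  //
  // Purely as an illustrative sketch (the IR-like notation below is
  // exposition, not something this function matches on): a truncate such as
  //   %r = trunc <8 x i32> %x to <8 x i16>
  // is rebuilt here as
  //   v8i16 MVETRUNC(v4i32 lo(%x), v4i32 hi(%x))
  // and that MVETRUNC is later either folded into the VMOVNT/B style
  // instructions described above or, failing that, expanded into the two
  // truncating stack stores plus a reload mentioned in the last option.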
So9349// trunc(mul(sext(a), sext(b))) may become9350// VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in9351// this case can use VMULL). This is performed in the9352// MVELaneInterleavingPass.9353// - Otherwise we have an option. By default we would expand the9354// zext/sext/trunc into a series of lane extract/inserts going via GPR9355// registers. One for each vector lane in the vector. This can obviously be9356// very expensive.9357// - The other option is to use the fact that loads/store can extend/truncate9358// to turn a trunc into two truncating stack stores and a stack reload. This9359// becomes 3 back-to-back memory operations, but at least that is less than9360// all the insert/extracts.9361//9362// In order to do the last, we convert certain trunc's into MVETRUNC, which9363// are either optimized where they can be, or eventually lowered into stack9364// stores/loads. This prevents us from splitting a v8i16 trunc into two stores9365// two early, where other instructions would be better, and stops us from9366// having to reconstruct multiple buildvector shuffles into loads/stores.9367if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)9368return SDValue();9369EVT FromVT = N->getOperand(0).getValueType();9370if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)9371return SDValue();93729373SDValue Lo, Hi;9374std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);9375SDLoc DL(N);9376return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);9377}93789379static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,9380const ARMSubtarget *Subtarget) {9381if (!Subtarget->hasMVEIntegerOps())9382return SDValue();93839384// See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.93859386EVT ToVT = N->getValueType(0);9387if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)9388return SDValue();9389SDValue Op = N->getOperand(0);9390EVT FromVT = Op.getValueType();9391if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)9392return SDValue();93939394SDLoc DL(N);9395EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());9396if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)9397ExtVT = MVT::v8i16;93989399unsigned Opcode =9400N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;9401SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);9402SDValue Ext1 = Ext.getValue(1);94039404if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {9405Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);9406Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);9407}94089409return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);9410}94119412/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each9413/// element has been zero/sign-extended, depending on the isSigned parameter,9414/// from an integer type half its size.9415static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,9416bool isSigned) {9417// A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.9418EVT VT = N->getValueType(0);9419if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {9420SDNode *BVN = N->getOperand(0).getNode();9421if (BVN->getValueType(0) != MVT::v4i32 ||9422BVN->getOpcode() != ISD::BUILD_VECTOR)9423return false;9424unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 
1 : 0;9425unsigned HiElt = 1 - LoElt;9426ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));9427ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));9428ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));9429ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));9430if (!Lo0 || !Hi0 || !Lo1 || !Hi1)9431return false;9432if (isSigned) {9433if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&9434Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)9435return true;9436} else {9437if (Hi0->isZero() && Hi1->isZero())9438return true;9439}9440return false;9441}94429443if (N->getOpcode() != ISD::BUILD_VECTOR)9444return false;94459446for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {9447SDNode *Elt = N->getOperand(i).getNode();9448if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {9449unsigned EltSize = VT.getScalarSizeInBits();9450unsigned HalfSize = EltSize / 2;9451if (isSigned) {9452if (!isIntN(HalfSize, C->getSExtValue()))9453return false;9454} else {9455if (!isUIntN(HalfSize, C->getZExtValue()))9456return false;9457}9458continue;9459}9460return false;9461}94629463return true;9464}94659466/// isSignExtended - Check if a node is a vector value that is sign-extended9467/// or a constant BUILD_VECTOR with sign-extended elements.9468static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {9469if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))9470return true;9471if (isExtendedBUILD_VECTOR(N, DAG, true))9472return true;9473return false;9474}94759476/// isZeroExtended - Check if a node is a vector value that is zero-extended (or9477/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.9478static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {9479if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||9480ISD::isZEXTLoad(N))9481return true;9482if (isExtendedBUILD_VECTOR(N, DAG, false))9483return true;9484return false;9485}94869487static EVT getExtensionTo64Bits(const EVT &OrigVT) {9488if (OrigVT.getSizeInBits() >= 64)9489return OrigVT;94909491assert(OrigVT.isSimple() && "Expecting a simple value type");94929493MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;9494switch (OrigSimpleTy) {9495default: llvm_unreachable("Unexpected Vector Type");9496case MVT::v2i8:9497case MVT::v2i16:9498return MVT::v2i32;9499case MVT::v4i8:9500return MVT::v4i16;9501}9502}95039504/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total9505/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.9506/// We insert the required extension here to get the vector to fill a D register.9507static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,9508const EVT &OrigTy,9509const EVT &ExtTy,9510unsigned ExtOpcode) {9511// The vector originally had a size of OrigTy. It was then extended to ExtTy.9512// We expect the ExtTy to be 128-bits total. If the OrigTy is less than9513// 64-bits we need to insert a new extension so that it will be 64-bits.9514assert(ExtTy.is128BitVector() && "Unexpected extension size");9515if (OrigTy.getSizeInBits() >= 64)9516return N;95179518// Must extend size to at least 64 bits to be used as an operand for VMULL.9519EVT NewVT = getExtensionTo64Bits(OrigTy);95209521return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);9522}95239524/// SkipLoadExtensionForVMULL - return a load of the original vector size that9525/// does not do any sign/zero extension. 
If the original vector is less9526/// than 64 bits, an appropriate extension will be added after the load to9527/// reach a total size of 64 bits. We have to add the extension separately9528/// because ARM does not have a sign/zero extending load for vectors.9529static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {9530EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());95319532// The load already has the right type.9533if (ExtendedTy == LD->getMemoryVT())9534return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),9535LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),9536LD->getMemOperand()->getFlags());95379538// We need to create a zextload/sextload. We cannot just create a load9539// followed by a zext/zext node because LowerMUL is also run during normal9540// operation legalization where we can't create illegal types.9541return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,9542LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),9543LD->getMemoryVT(), LD->getAlign(),9544LD->getMemOperand()->getFlags());9545}95469547/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,9548/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return9549/// the unextended value. The unextended vector should be 64 bits so that it can9550/// be used as an operand to a VMULL instruction. If the original vector size9551/// before extension is less than 64 bits we add a an extension to resize9552/// the vector to 64 bits.9553static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {9554if (N->getOpcode() == ISD::SIGN_EXTEND ||9555N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)9556return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,9557N->getOperand(0)->getValueType(0),9558N->getValueType(0),9559N->getOpcode());95609561if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {9562assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&9563"Expected extending load");95649565SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);9566DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));9567unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;9568SDValue extLoad =9569DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);9570DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);95719572return newLoad;9573}95749575// Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will9576// have been legalized as a BITCAST from v4i32.9577if (N->getOpcode() == ISD::BITCAST) {9578SDNode *BVN = N->getOperand(0).getNode();9579assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&9580BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");9581unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;9582return DAG.getBuildVector(9583MVT::v2i32, SDLoc(N),9584{BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});9585}9586// Construct a new BUILD_VECTOR with elements truncated to half the size.9587assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");9588EVT VT = N->getValueType(0);9589unsigned EltSize = VT.getScalarSizeInBits() / 2;9590unsigned NumElts = VT.getVectorNumElements();9591MVT TruncVT = MVT::getIntegerVT(EltSize);9592SmallVector<SDValue, 8> Ops;9593SDLoc dl(N);9594for (unsigned i = 0; i != NumElts; ++i) {9595const APInt &CInt = N->getConstantOperandAPInt(i);9596// Element types smaller than 32 bits are not legal, so use i32 elements.9597// The values are implicitly truncated so sext vs. 
zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}

static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}

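// Illustrative examples for the lowering below (the DAG shapes here are
// exposition only, not extra patterns beyond what LowerMUL itself checks):
//   (v8i16 mul (sext v8i8 a), (sext v8i8 b))   -> (v8i16 VMULLs a, b)
//   (v4i32 mul (zext v4i16 a), (zext v4i16 b)) -> (v4i32 VMULLu a, b)
// and, via the isAddSubSExt/isAddSubZExt helpers above, the MLA-style shape
//   (mul (add (zext a), (zext b)), (zext c))
//     -> (add (VMULLu a, c), (VMULLu b, c))
// which LowerMUL emits so that it maps onto back-to-back vmull/vmlal.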
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  //   vmull q0, d4, d6
  //   vmlal q0, d5, d6
  // is faster than
  //   vaddl q0, d4, d5
  //   vmovl q1, d6
  //   vmul  q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}

static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps.
This requires that we use a weird bias9718// of 0xb000, however (again, this has been exhaustively tested).9719// float4 result = as_float4(as_int4(xf*recip) + 0xb000);9720X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);9721X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);9722Y = DAG.getConstant(0xb000, dl, MVT::v4i32);9723X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);9724X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);9725// Convert back to short.9726X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);9727X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);9728return X;9729}97309731static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,9732SelectionDAG &DAG) {9733// TODO: Should this propagate fast-math-flags?97349735SDValue N2;9736// Convert to float.9737// float4 yf = vcvt_f32_s32(vmovl_s16(y));9738// float4 xf = vcvt_f32_s32(vmovl_s16(x));9739N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);9740N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);9741N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);9742N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);97439744// Use reciprocal estimate and one refinement step.9745// float4 recip = vrecpeq_f32(yf);9746// recip *= vrecpsq_f32(yf, recip);9747N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,9748DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),9749N1);9750N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,9751DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),9752N1, N2);9753N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);9754// Because short has a smaller range than ushort, we can actually get away9755// with only a single newton step. This requires that we use a weird bias9756// of 89, however (again, this has been exhaustively tested).9757// float4 result = as_float4(as_int4(xf*recip) + 0x89);9758N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);9759N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);9760N1 = DAG.getConstant(0x89, dl, MVT::v4i32);9761N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);9762N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);9763// Convert back to integer and return.9764// return vmovn_s32(vcvt_s32_f32(result));9765N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);9766N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);9767return N0;9768}97699770static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,9771const ARMSubtarget *ST) {9772EVT VT = Op.getValueType();9773assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&9774"unexpected type for custom-lowering ISD::SDIV");97759776SDLoc dl(Op);9777SDValue N0 = Op.getOperand(0);9778SDValue N1 = Op.getOperand(1);9779SDValue N2, N3;97809781if (VT == MVT::v8i8) {9782N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);9783N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);97849785N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,9786DAG.getIntPtrConstant(4, dl));9787N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,9788DAG.getIntPtrConstant(4, dl));9789N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,9790DAG.getIntPtrConstant(0, dl));9791N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,9792DAG.getIntPtrConstant(0, dl));97939794N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i169795N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i1697969797N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);9798N0 = LowerCONCAT_VECTORS(N0, DAG, ST);97999800N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);9801return N0;9802}9803return LowerSDIV_v4i16(N0, N1, dl, 
DAG);9804}98059806static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,9807const ARMSubtarget *ST) {9808// TODO: Should this propagate fast-math-flags?9809EVT VT = Op.getValueType();9810assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&9811"unexpected type for custom-lowering ISD::UDIV");98129813SDLoc dl(Op);9814SDValue N0 = Op.getOperand(0);9815SDValue N1 = Op.getOperand(1);9816SDValue N2, N3;98179818if (VT == MVT::v8i8) {9819N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);9820N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);98219822N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,9823DAG.getIntPtrConstant(4, dl));9824N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,9825DAG.getIntPtrConstant(4, dl));9826N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,9827DAG.getIntPtrConstant(0, dl));9828N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,9829DAG.getIntPtrConstant(0, dl));98309831N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i169832N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i1698339834N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);9835N0 = LowerCONCAT_VECTORS(N0, DAG, ST);98369837N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,9838DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,9839MVT::i32),9840N0);9841return N0;9842}98439844// v4i16 sdiv ... Convert to float.9845// float4 yf = vcvt_f32_s32(vmovl_u16(y));9846// float4 xf = vcvt_f32_s32(vmovl_u16(x));9847N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);9848N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);9849N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);9850SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);98519852// Use reciprocal estimate and two refinement steps.9853// float4 recip = vrecpeq_f32(yf);9854// recip *= vrecpsq_f32(yf, recip);9855// recip *= vrecpsq_f32(yf, recip);9856N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,9857DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),9858BN1);9859N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,9860DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),9861BN1, N2);9862N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);9863N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,9864DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),9865BN1, N2);9866N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);9867// Simply multiplying by the reciprocal estimate can leave us a few ulps9868// too low, so we add 2 ulps (exhaustive testing shows that this is enough,9869// and that it will never cause us to return an answer too large).9870// float4 result = as_float4(as_int4(xf*recip) + 2);9871N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);9872N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);9873N1 = DAG.getConstant(2, dl, MVT::v4i32);9874N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);9875N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);9876// Convert back to integer and return.9877// return vmovn_u32(vcvt_s32_f32(result));9878N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);9879N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);9880return N0;9881}98829883static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {9884SDNode *N = Op.getNode();9885EVT VT = N->getValueType(0);9886SDVTList VTs = DAG.getVTList(VT, MVT::i32);98879888SDValue Carry = Op.getOperand(2);98899890SDLoc DL(Op);98919892SDValue Result;9893if (Op.getOpcode() == ISD::UADDO_CARRY) {9894// This converts the boolean value carry into the carry flag.9895Carry = 
ConvertBooleanCarryToCarryFlag(Carry, DAG);98969897// Do the addition proper using the carry flag we wanted.9898Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),9899Op.getOperand(1), Carry);99009901// Now convert the carry flag into a boolean value.9902Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);9903} else {9904// ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we9905// have to invert the carry first.9906Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,9907DAG.getConstant(1, DL, MVT::i32), Carry);9908// This converts the boolean value carry into the carry flag.9909Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);99109911// Do the subtraction proper using the carry flag we wanted.9912Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),9913Op.getOperand(1), Carry);99149915// Now convert the carry flag into a boolean value.9916Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);9917// But the carry returned by ARMISD::SUBE is not a borrow as expected9918// by ISD::USUBO_CARRY, so compute 1 - C.9919Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,9920DAG.getConstant(1, DL, MVT::i32), Carry);9921}99229923// Return both values.9924return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);9925}99269927SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {9928assert(Subtarget->isTargetDarwin());99299930// For iOS, we want to call an alternative entry point: __sincos_stret,9931// return values are passed via sret.9932SDLoc dl(Op);9933SDValue Arg = Op.getOperand(0);9934EVT ArgVT = Arg.getValueType();9935Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());9936auto PtrVT = getPointerTy(DAG.getDataLayout());99379938MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();9939const TargetLowering &TLI = DAG.getTargetLoweringInfo();99409941// Pair of floats / doubles used to pass the result.9942Type *RetTy = StructType::get(ArgTy, ArgTy);9943auto &DL = DAG.getDataLayout();99449945ArgListTy Args;9946bool ShouldUseSRet = Subtarget->isAPCS_ABI();9947SDValue SRet;9948if (ShouldUseSRet) {9949// Create stack object for sret.9950const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);9951const Align StackAlign = DL.getPrefTypeAlign(RetTy);9952int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);9953SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));99549955ArgListEntry Entry;9956Entry.Node = SRet;9957Entry.Ty = PointerType::getUnqual(RetTy->getContext());9958Entry.IsSExt = false;9959Entry.IsZExt = false;9960Entry.IsSRet = true;9961Args.push_back(Entry);9962RetTy = Type::getVoidTy(*DAG.getContext());9963}99649965ArgListEntry Entry;9966Entry.Node = Arg;9967Entry.Ty = ArgTy;9968Entry.IsSExt = false;9969Entry.IsZExt = false;9970Args.push_back(Entry);99719972RTLIB::Libcall LC =9973(ArgVT == MVT::f64) ? 
RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;9974const char *LibcallName = getLibcallName(LC);9975CallingConv::ID CC = getLibcallCallingConv(LC);9976SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));99779978TargetLowering::CallLoweringInfo CLI(DAG);9979CLI.setDebugLoc(dl)9980.setChain(DAG.getEntryNode())9981.setCallee(CC, RetTy, Callee, std::move(Args))9982.setDiscardResult(ShouldUseSRet);9983std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);99849985if (!ShouldUseSRet)9986return CallResult.first;99879988SDValue LoadSin =9989DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());99909991// Address of cos field.9992SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,9993DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));9994SDValue LoadCos =9995DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());99969997SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);9998return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,9999LoadSin.getValue(0), LoadCos.getValue(0));10000}1000110002SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,10003bool Signed,10004SDValue &Chain) const {10005EVT VT = Op.getValueType();10006assert((VT == MVT::i32 || VT == MVT::i64) &&10007"unexpected type for custom lowering DIV");10008SDLoc dl(Op);1000910010const auto &DL = DAG.getDataLayout();10011const auto &TLI = DAG.getTargetLoweringInfo();1001210013const char *Name = nullptr;10014if (Signed)10015Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";10016else10017Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";1001810019SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));1002010021ARMTargetLowering::ArgListTy Args;1002210023for (auto AI : {1, 0}) {10024ArgListEntry Arg;10025Arg.Node = Op.getOperand(AI);10026Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());10027Args.push_back(Arg);10028}1002910030CallLoweringInfo CLI(DAG);10031CLI.setDebugLoc(dl)10032.setChain(Chain)10033.setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),10034ES, std::move(Args));1003510036return LowerCallTo(CLI).first;10037}1003810039// This is a code size optimisation: return the original SDIV node to10040// DAGCombiner when we don't want to expand SDIV into a sequence of10041// instructions, and an empty node otherwise which will cause the10042// SDIV to be expanded in DAGCombine.10043SDValue10044ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,10045SelectionDAG &DAG,10046SmallVectorImpl<SDNode *> &Created) const {10047// TODO: Support SREM10048if (N->getOpcode() != ISD::SDIV)10049return SDValue();1005010051const auto &ST = DAG.getSubtarget<ARMSubtarget>();10052const bool MinSize = ST.hasMinSize();10053const bool HasDivide = ST.isThumb() ? 
ST.hasDivideInThumbMode()10054: ST.hasDivideInARMMode();1005510056// Don't touch vector types; rewriting this may lead to scalarizing10057// the int divs.10058if (N->getOperand(0).getValueType().isVector())10059return SDValue();1006010061// Bail if MinSize is not set, and also for both ARM and Thumb mode we need10062// hwdiv support for this to be really profitable.10063if (!(MinSize && HasDivide))10064return SDValue();1006510066// ARM mode is a bit simpler than Thumb: we can handle large power10067// of 2 immediates with 1 mov instruction; no further checks required,10068// just return the sdiv node.10069if (!ST.isThumb())10070return SDValue(N, 0);1007110072// In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,10073// and thus lose the code size benefits of a MOVS that requires only 2.10074// TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,10075// but as it's doing exactly this, it's not worth the trouble to get TTI.10076if (Divisor.sgt(128))10077return SDValue();1007810079return SDValue(N, 0);10080}1008110082SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,10083bool Signed) const {10084assert(Op.getValueType() == MVT::i32 &&10085"unexpected type for custom lowering DIV");10086SDLoc dl(Op);1008710088SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,10089DAG.getEntryNode(), Op.getOperand(1));1009010091return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);10092}1009310094static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {10095SDLoc DL(N);10096SDValue Op = N->getOperand(1);10097if (N->getValueType(0) == MVT::i32)10098return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);10099SDValue Lo, Hi;10100std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);10101return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,10102DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));10103}1010410105void ARMTargetLowering::ExpandDIV_Windows(10106SDValue Op, SelectionDAG &DAG, bool Signed,10107SmallVectorImpl<SDValue> &Results) const {10108const auto &DL = DAG.getDataLayout();10109const auto &TLI = DAG.getTargetLoweringInfo();1011010111assert(Op.getValueType() == MVT::i64 &&10112"unexpected type for custom lowering DIV");10113SDLoc dl(Op);1011410115SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());1011610117SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);1011810119SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);10120SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,10121DAG.getConstant(32, dl, TLI.getPointerTy(DL)));10122Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);1012310124Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));10125}1012610127static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {10128LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());10129EVT MemVT = LD->getMemoryVT();10130assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||10131MemVT == MVT::v16i1) &&10132"Expected a predicate type!");10133assert(MemVT == Op.getValueType());10134assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&10135"Expected a non-extending load");10136assert(LD->isUnindexed() && "Expected a unindexed load");1013710138// The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit10139// predicate, with the "v4i1" bits spread out over the 16 bits loaded. 
We10140// need to make sure that 8/4/2 bits are actually loaded into the correct10141// place, which means loading the value and then shuffling the values into10142// the bottom bits of the predicate.10143// Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect10144// for BE).10145// Speaking of BE, apparently the rest of llvm will assume a reverse order to10146// a natural VMSR(load), so needs to be reversed.1014710148SDLoc dl(Op);10149SDValue Load = DAG.getExtLoad(10150ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),10151EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),10152LD->getMemOperand());10153SDValue Val = Load;10154if (DAG.getDataLayout().isBigEndian())10155Val = DAG.getNode(ISD::SRL, dl, MVT::i32,10156DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),10157DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));10158SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);10159if (MemVT != MVT::v16i1)10160Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,10161DAG.getConstant(0, dl, MVT::i32));10162return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);10163}1016410165void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,10166SelectionDAG &DAG) const {10167LoadSDNode *LD = cast<LoadSDNode>(N);10168EVT MemVT = LD->getMemoryVT();10169assert(LD->isUnindexed() && "Loads should be unindexed at this point.");1017010171if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&10172!Subtarget->isThumb1Only() && LD->isVolatile() &&10173LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {10174SDLoc dl(N);10175SDValue Result = DAG.getMemIntrinsicNode(10176ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),10177{LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());10178SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);10179SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);10180SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);10181Results.append({Pair, Result.getValue(2)});10182}10183}1018410185static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {10186StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());10187EVT MemVT = ST->getMemoryVT();10188assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||10189MemVT == MVT::v16i1) &&10190"Expected a predicate type!");10191assert(MemVT == ST->getValue().getValueType());10192assert(!ST->isTruncatingStore() && "Expected a non-extending store");10193assert(ST->isUnindexed() && "Expected a unindexed store");1019410195// Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with10196// top bits unset and a scalar store.10197SDLoc dl(Op);10198SDValue Build = ST->getValue();10199if (MemVT != MVT::v16i1) {10200SmallVector<SDValue, 16> Ops;10201for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {10202unsigned Elt = DAG.getDataLayout().isBigEndian()10203? 
MemVT.getVectorNumElements() - I - 1
                         : I;
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(Elt, dl, MVT::i32)));
    }
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
    GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
                      DAG.getConstant(16, dl, MVT::i32));
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
      ST->getMemOperand());
}

static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
                          const ARMSubtarget *Subtarget) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert(ST->isUnindexed() && "Stores should be unindexed at this point.");

  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && ST->isVolatile() &&
      ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
    SDNode *N = Op.getNode();
    SDLoc dl(N);

    SDValue Lo = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
                              MVT::i32));
    SDValue Hi = DAG.getNode(
        ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
        DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
                              MVT::i32));

    return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
                                   {ST->getChain(), Lo, Hi, ST->getBasePtr()},
                                   MemVT, ST->getMemOperand());
  } else if (Subtarget->hasMVEIntegerOps() &&
             ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
               MemVT == MVT::v16i1))) {
    return LowerPredicateStore(Op, DAG);
  }

  return SDValue();
}

static bool isZeroVector(SDValue N) {
  return (ISD::isBuildVectorAllZeros(N.getNode()) ||
          (N->getOpcode() == ARMISD::VMOVIMM &&
           isNullConstant(N->getOperand(0))));
}

static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  if (isZeroVector(PassThru))
    return Op;

  // MVE Masked loads use zero as the passthru value. Here we convert undef to
  // zero too, and other values are lowered to a select.
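  //
  // Roughly, and purely as an illustrative sketch in IR-like notation (the
  // names are exposition only):
  //   %l = masked.load(%ptr, %mask, %passthru)
  // is rebuilt as
  //   %z = masked.load(%ptr, %mask, zeroinitializer)
  //   %l = select %mask, %z, %passthru
  // with the select dropped when %passthru is undef or already a zero vector.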
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
      N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
      N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
                             PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
                            isZeroVector(PassThru->getOperand(0));
  if (!PassThru.isUndef() && !PassThruIsCastZero)
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}

static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  SDLoc dl(Op);
  unsigned BaseOpcode = 0;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Expected VECREDUCE opcode");
  case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
  case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
  case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
  case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
  case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
  case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
  case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
  case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
  }

  SDValue Op0 = Op->getOperand(0);
  EVT VT = Op0.getValueType();
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumActiveLanes = NumElts;

  assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
          NumActiveLanes == 2) &&
         "Only expected a power 2 vector size");

  // Use BaseOpcode(X, Rev(X)) until 4 items remain. Going down to 4 vector
  // elements allows us to easily extract vector elements from the lanes.
  while (NumActiveLanes > 4) {
    unsigned RevOpcode = NumActiveLanes == 16 ?
ARMISD::VREV16 : ARMISD::VREV32;10322SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);10323Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);10324NumActiveLanes /= 2;10325}1032610327SDValue Res;10328if (NumActiveLanes == 4) {10329// The remaining 4 elements are summed sequentially10330SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10331DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));10332SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10333DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));10334SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10335DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));10336SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10337DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));10338SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());10339SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());10340Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());10341} else {10342SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10343DAG.getConstant(0, dl, MVT::i32));10344SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10345DAG.getConstant(1, dl, MVT::i32));10346Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());10347}1034810349// Result type may be wider than element type.10350if (EltVT != Op->getValueType(0))10351Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);10352return Res;10353}1035410355static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,10356const ARMSubtarget *ST) {10357if (!ST->hasMVEFloatOps())10358return SDValue();10359return LowerVecReduce(Op, DAG, ST);10360}1036110362static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,10363const ARMSubtarget *ST) {10364if (!ST->hasNEON())10365return SDValue();1036610367SDLoc dl(Op);10368SDValue Op0 = Op->getOperand(0);10369EVT VT = Op0.getValueType();10370EVT EltVT = VT.getVectorElementType();1037110372unsigned PairwiseIntrinsic = 0;10373switch (Op->getOpcode()) {10374default:10375llvm_unreachable("Expected VECREDUCE opcode");10376case ISD::VECREDUCE_UMIN:10377PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;10378break;10379case ISD::VECREDUCE_UMAX:10380PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;10381break;10382case ISD::VECREDUCE_SMIN:10383PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;10384break;10385case ISD::VECREDUCE_SMAX:10386PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;10387break;10388}10389SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);1039010391unsigned NumElts = VT.getVectorNumElements();10392unsigned NumActiveLanes = NumElts;1039310394assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||10395NumActiveLanes == 2) &&10396"Only expected a power 2 vector size");1039710398// Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.10399if (VT.is128BitVector()) {10400SDValue Lo, Hi;10401std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);10402VT = Lo.getValueType();10403Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});10404NumActiveLanes /= 2;10405}1040610407// Use pairwise reductions until one lane remains10408while (NumActiveLanes > 1) {10409Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});10410NumActiveLanes /= 2;10411}1041210413SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,10414DAG.getConstant(0, dl, MVT::i32));1041510416// Result type may be wider than element type.10417if (EltVT != 
Op.getValueType()) {10418unsigned Extend = 0;10419switch (Op->getOpcode()) {10420default:10421llvm_unreachable("Expected VECREDUCE opcode");10422case ISD::VECREDUCE_UMIN:10423case ISD::VECREDUCE_UMAX:10424Extend = ISD::ZERO_EXTEND;10425break;10426case ISD::VECREDUCE_SMIN:10427case ISD::VECREDUCE_SMAX:10428Extend = ISD::SIGN_EXTEND;10429break;10430}10431Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);10432}10433return Res;10434}1043510436static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {10437if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))10438// Acquire/Release load/store is not legal for targets without a dmb or10439// equivalent available.10440return SDValue();1044110442// Monotonic load/store is legal for all targets.10443return Op;10444}1044510446static void ReplaceREADCYCLECOUNTER(SDNode *N,10447SmallVectorImpl<SDValue> &Results,10448SelectionDAG &DAG,10449const ARMSubtarget *Subtarget) {10450SDLoc DL(N);10451// Under Power Management extensions, the cycle-count is:10452// mrc p15, #0, <Rt>, c9, c13, #010453SDValue Ops[] = { N->getOperand(0), // Chain10454DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),10455DAG.getTargetConstant(15, DL, MVT::i32),10456DAG.getTargetConstant(0, DL, MVT::i32),10457DAG.getTargetConstant(9, DL, MVT::i32),10458DAG.getTargetConstant(13, DL, MVT::i32),10459DAG.getTargetConstant(0, DL, MVT::i32)10460};1046110462SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,10463DAG.getVTList(MVT::i32, MVT::Other), Ops);10464Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,10465DAG.getConstant(0, DL, MVT::i32)));10466Results.push_back(Cycles32.getValue(1));10467}1046810469static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {10470SDLoc dl(V.getNode());10471auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);10472bool isBigEndian = DAG.getDataLayout().isBigEndian();10473if (isBigEndian)10474std::swap (VLo, VHi);10475SDValue RegClass =10476DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);10477SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);10478SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);10479const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };10480return SDValue(10481DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);10482}1048310484static void ReplaceCMP_SWAP_64Results(SDNode *N,10485SmallVectorImpl<SDValue> & Results,10486SelectionDAG &DAG) {10487assert(N->getValueType(0) == MVT::i64 &&10488"AtomicCmpSwap on types less than 64 should be legal");10489SDValue Ops[] = {N->getOperand(1),10490createGPRPairNode(DAG, N->getOperand(2)),10491createGPRPairNode(DAG, N->getOperand(3)),10492N->getOperand(0)};10493SDNode *CmpSwap = DAG.getMachineNode(10494ARM::CMP_SWAP_64, SDLoc(N),10495DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);1049610497MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();10498DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});1049910500bool isBigEndian = DAG.getDataLayout().isBigEndian();1050110502SDValue Lo =10503DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,10504SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));10505SDValue Hi =10506DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1,10507SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));10508Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));10509Results.push_back(SDValue(CmpSwap, 2));10510}1051110512SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {10513SDLoc dl(Op);10514EVT VT = Op.getValueType();10515SDValue Chain = Op.getOperand(0);10516SDValue LHS = Op.getOperand(1);10517SDValue RHS = Op.getOperand(2);10518ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();10519bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;1052010521// If we don't have instructions of this float type then soften to a libcall10522// and use SETCC instead.10523if (isUnsupportedFloatingType(LHS.getValueType())) {10524DAG.getTargetLoweringInfo().softenSetCCOperands(10525DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);10526if (!RHS.getNode()) {10527RHS = DAG.getConstant(0, dl, LHS.getValueType());10528CC = ISD::SETNE;10529}10530SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,10531DAG.getCondCode(CC));10532return DAG.getMergeValues({Result, Chain}, dl);10533}1053410535ARMCC::CondCodes CondCode, CondCode2;10536FPCCToARMCC(CC, CondCode, CondCode2);1053710538// FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit10539// in CMPFP and CMPFPE, but instead it should be made explicit by these10540// instructions using a chain instead of glue. This would also fix the problem10541// here (and also in LowerSELECT_CC) where we generate two comparisons when10542// CondCode2 != AL.10543SDValue True = DAG.getConstant(1, dl, VT);10544SDValue False = DAG.getConstant(0, dl, VT);10545SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);10546SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);10547SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);10548SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);10549if (CondCode2 != ARMCC::AL) {10550ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);10551Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);10552Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);10553}10554return DAG.getMergeValues({Result, Chain}, dl);10555}1055610557SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {10558MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();1055910560EVT VT = getPointerTy(DAG.getDataLayout());10561SDLoc DL(Op);10562int FI = MFI.CreateFixedObject(4, 0, false);10563return DAG.getFrameIndex(FI, VT);10564}1056510566SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {10567LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());10568switch (Op.getOpcode()) {10569default: llvm_unreachable("Don't know how to custom lower this!");10570case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);10571case ISD::ConstantPool: return LowerConstantPool(Op, DAG);10572case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);10573case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);10574case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);10575case ISD::SELECT: return LowerSELECT(Op, DAG);10576case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);10577case ISD::BRCOND: return LowerBRCOND(Op, DAG);10578case ISD::BR_CC: return LowerBR_CC(Op, DAG);10579case ISD::BR_JT: return LowerBR_JT(Op, DAG);10580case ISD::VASTART: return LowerVASTART(Op, DAG);10581case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);10582case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, 
Subtarget);10583case ISD::SINT_TO_FP:10584case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);10585case ISD::STRICT_FP_TO_SINT:10586case ISD::STRICT_FP_TO_UINT:10587case ISD::FP_TO_SINT:10588case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);10589case ISD::FP_TO_SINT_SAT:10590case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);10591case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);10592case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);10593case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);10594case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);10595case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);10596case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);10597case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);10598case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,10599Subtarget);10600case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);10601case ISD::SHL:10602case ISD::SRL:10603case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);10604case ISD::SREM: return LowerREM(Op.getNode(), DAG);10605case ISD::UREM: return LowerREM(Op.getNode(), DAG);10606case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);10607case ISD::SRL_PARTS:10608case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);10609case ISD::CTTZ:10610case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);10611case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);10612case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);10613case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);10614case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);10615case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);10616case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);10617case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);10618case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);10619case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);10620case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);10621case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);10622case ISD::SIGN_EXTEND:10623case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);10624case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);10625case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);10626case ISD::SET_FPMODE:10627return LowerSET_FPMODE(Op, DAG);10628case ISD::RESET_FPMODE:10629return LowerRESET_FPMODE(Op, DAG);10630case ISD::MUL: return LowerMUL(Op, DAG);10631case ISD::SDIV:10632if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())10633return LowerDIV_Windows(Op, DAG, /* Signed */ true);10634return LowerSDIV(Op, DAG, Subtarget);10635case ISD::UDIV:10636if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())10637return LowerDIV_Windows(Op, DAG, /* Signed */ false);10638return LowerUDIV(Op, DAG, Subtarget);10639case ISD::UADDO_CARRY:10640case ISD::USUBO_CARRY:10641return LowerUADDSUBO_CARRY(Op, DAG);10642case ISD::SADDO:10643case ISD::SSUBO:10644return LowerSignedALUO(Op, DAG);10645case ISD::UADDO:10646case ISD::USUBO:10647return LowerUnsignedALUO(Op, DAG);10648case ISD::SADDSAT:10649case ISD::SSUBSAT:10650case ISD::UADDSAT:10651case ISD::USUBSAT:10652return LowerADDSUBSAT(Op, DAG, Subtarget);10653case ISD::LOAD:10654return LowerPredicateLoad(Op, DAG);10655case ISD::STORE:10656return 
LowerSTORE(Op, DAG, Subtarget);10657case ISD::MLOAD:10658return LowerMLOAD(Op, DAG);10659case ISD::VECREDUCE_MUL:10660case ISD::VECREDUCE_AND:10661case ISD::VECREDUCE_OR:10662case ISD::VECREDUCE_XOR:10663return LowerVecReduce(Op, DAG, Subtarget);10664case ISD::VECREDUCE_FADD:10665case ISD::VECREDUCE_FMUL:10666case ISD::VECREDUCE_FMIN:10667case ISD::VECREDUCE_FMAX:10668return LowerVecReduceF(Op, DAG, Subtarget);10669case ISD::VECREDUCE_UMIN:10670case ISD::VECREDUCE_UMAX:10671case ISD::VECREDUCE_SMIN:10672case ISD::VECREDUCE_SMAX:10673return LowerVecReduceMinMax(Op, DAG, Subtarget);10674case ISD::ATOMIC_LOAD:10675case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);10676case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);10677case ISD::SDIVREM:10678case ISD::UDIVREM: return LowerDivRem(Op, DAG);10679case ISD::DYNAMIC_STACKALLOC:10680if (Subtarget->isTargetWindows())10681return LowerDYNAMIC_STACKALLOC(Op, DAG);10682llvm_unreachable("Don't know how to custom lower this!");10683case ISD::STRICT_FP_ROUND:10684case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);10685case ISD::STRICT_FP_EXTEND:10686case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);10687case ISD::STRICT_FSETCC:10688case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);10689case ISD::SPONENTRY:10690return LowerSPONENTRY(Op, DAG);10691case ARMISD::WIN__DBZCHK: return SDValue();10692}10693}1069410695static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,10696SelectionDAG &DAG) {10697unsigned IntNo = N->getConstantOperandVal(0);10698unsigned Opc = 0;10699if (IntNo == Intrinsic::arm_smlald)10700Opc = ARMISD::SMLALD;10701else if (IntNo == Intrinsic::arm_smlaldx)10702Opc = ARMISD::SMLALDX;10703else if (IntNo == Intrinsic::arm_smlsld)10704Opc = ARMISD::SMLSLD;10705else if (IntNo == Intrinsic::arm_smlsldx)10706Opc = ARMISD::SMLSLDX;10707else10708return;1070910710SDLoc dl(N);10711SDValue Lo, Hi;10712std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);1071310714SDValue LongMul = DAG.getNode(Opc, dl,10715DAG.getVTList(MVT::i32, MVT::i32),10716N->getOperand(1), N->getOperand(2),10717Lo, Hi);10718Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,10719LongMul.getValue(0), LongMul.getValue(1)));10720}1072110722/// ReplaceNodeResults - Replace the results of node with an illegal result10723/// type with new values built out of custom code.10724void ARMTargetLowering::ReplaceNodeResults(SDNode *N,10725SmallVectorImpl<SDValue> &Results,10726SelectionDAG &DAG) const {10727SDValue Res;10728switch (N->getOpcode()) {10729default:10730llvm_unreachable("Don't know how to custom expand this!");10731case ISD::READ_REGISTER:10732ExpandREAD_REGISTER(N, Results, DAG);10733break;10734case ISD::BITCAST:10735Res = ExpandBITCAST(N, DAG, Subtarget);10736break;10737case ISD::SRL:10738case ISD::SRA:10739case ISD::SHL:10740Res = Expand64BitShift(N, DAG, Subtarget);10741break;10742case ISD::SREM:10743case ISD::UREM:10744Res = LowerREM(N, DAG);10745break;10746case ISD::SDIVREM:10747case ISD::UDIVREM:10748Res = LowerDivRem(SDValue(N, 0), DAG);10749assert(Res.getNumOperands() == 2 && "DivRem needs two values");10750Results.push_back(Res.getValue(0));10751Results.push_back(Res.getValue(1));10752return;10753case ISD::SADDSAT:10754case ISD::SSUBSAT:10755case ISD::UADDSAT:10756case ISD::USUBSAT:10757Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);10758break;10759case ISD::READCYCLECOUNTER:10760ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);10761return;10762case ISD::UDIV:10763case 
ISD::SDIV:10764assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");10765return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,10766Results);10767case ISD::ATOMIC_CMP_SWAP:10768ReplaceCMP_SWAP_64Results(N, Results, DAG);10769return;10770case ISD::INTRINSIC_WO_CHAIN:10771return ReplaceLongIntrinsic(N, Results, DAG);10772case ISD::LOAD:10773LowerLOAD(N, Results, DAG);10774break;10775case ISD::TRUNCATE:10776Res = LowerTruncate(N, DAG, Subtarget);10777break;10778case ISD::SIGN_EXTEND:10779case ISD::ZERO_EXTEND:10780Res = LowerVectorExtend(N, DAG, Subtarget);10781break;10782case ISD::FP_TO_SINT_SAT:10783case ISD::FP_TO_UINT_SAT:10784Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);10785break;10786}10787if (Res.getNode())10788Results.push_back(Res);10789}1079010791//===----------------------------------------------------------------------===//10792// ARM Scheduler Hooks10793//===----------------------------------------------------------------------===//1079410795/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and10796/// registers the function context.10797void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,10798MachineBasicBlock *MBB,10799MachineBasicBlock *DispatchBB,10800int FI) const {10801assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&10802"ROPI/RWPI not currently supported with SjLj");10803const TargetInstrInfo *TII = Subtarget->getInstrInfo();10804DebugLoc dl = MI.getDebugLoc();10805MachineFunction *MF = MBB->getParent();10806MachineRegisterInfo *MRI = &MF->getRegInfo();10807MachineConstantPool *MCP = MF->getConstantPool();10808ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();10809const Function &F = MF->getFunction();1081010811bool isThumb = Subtarget->isThumb();10812bool isThumb2 = Subtarget->isThumb2();1081310814unsigned PCLabelId = AFI->createPICLabelUId();10815unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;10816ARMConstantPoolValue *CPV =10817ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);10818unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));1081910820const TargetRegisterClass *TRC = isThumb ? 
&ARM::tGPRRegClass10821: &ARM::GPRRegClass;1082210823// Grab constant pool and fixed stack memory operands.10824MachineMemOperand *CPMMO =10825MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),10826MachineMemOperand::MOLoad, 4, Align(4));1082710828MachineMemOperand *FIMMOSt =10829MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),10830MachineMemOperand::MOStore, 4, Align(4));1083110832// Load the address of the dispatch MBB into the jump buffer.10833if (isThumb2) {10834// Incoming value: jbuf10835// ldr.n r5, LCPI1_110836// orr r5, r5, #110837// add r5, pc10838// str r5, [$jbuf, #+4] ; &jbuf[1]10839Register NewVReg1 = MRI->createVirtualRegister(TRC);10840BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)10841.addConstantPoolIndex(CPI)10842.addMemOperand(CPMMO)10843.add(predOps(ARMCC::AL));10844// Set the low bit because of thumb mode.10845Register NewVReg2 = MRI->createVirtualRegister(TRC);10846BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)10847.addReg(NewVReg1, RegState::Kill)10848.addImm(0x01)10849.add(predOps(ARMCC::AL))10850.add(condCodeOp());10851Register NewVReg3 = MRI->createVirtualRegister(TRC);10852BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)10853.addReg(NewVReg2, RegState::Kill)10854.addImm(PCLabelId);10855BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))10856.addReg(NewVReg3, RegState::Kill)10857.addFrameIndex(FI)10858.addImm(36) // &jbuf[1] :: pc10859.addMemOperand(FIMMOSt)10860.add(predOps(ARMCC::AL));10861} else if (isThumb) {10862// Incoming value: jbuf10863// ldr.n r1, LCPI1_410864// add r1, pc10865// mov r2, #110866// orrs r1, r210867// add r2, $jbuf, #+4 ; &jbuf[1]10868// str r1, [r2]10869Register NewVReg1 = MRI->createVirtualRegister(TRC);10870BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)10871.addConstantPoolIndex(CPI)10872.addMemOperand(CPMMO)10873.add(predOps(ARMCC::AL));10874Register NewVReg2 = MRI->createVirtualRegister(TRC);10875BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)10876.addReg(NewVReg1, RegState::Kill)10877.addImm(PCLabelId);10878// Set the low bit because of thumb mode.10879Register NewVReg3 = MRI->createVirtualRegister(TRC);10880BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)10881.addReg(ARM::CPSR, RegState::Define)10882.addImm(1)10883.add(predOps(ARMCC::AL));10884Register NewVReg4 = MRI->createVirtualRegister(TRC);10885BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)10886.addReg(ARM::CPSR, RegState::Define)10887.addReg(NewVReg2, RegState::Kill)10888.addReg(NewVReg3, RegState::Kill)10889.add(predOps(ARMCC::AL));10890Register NewVReg5 = MRI->createVirtualRegister(TRC);10891BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)10892.addFrameIndex(FI)10893.addImm(36); // &jbuf[1] :: pc10894BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))10895.addReg(NewVReg4, RegState::Kill)10896.addReg(NewVReg5, RegState::Kill)10897.addImm(0)10898.addMemOperand(FIMMOSt)10899.add(predOps(ARMCC::AL));10900} else {10901// Incoming value: jbuf10902// ldr r1, LCPI1_110903// add r1, pc, r110904// str r1, [$jbuf, #+4] ; &jbuf[1]10905Register NewVReg1 = MRI->createVirtualRegister(TRC);10906BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)10907.addConstantPoolIndex(CPI)10908.addImm(0)10909.addMemOperand(CPMMO)10910.add(predOps(ARMCC::AL));10911Register NewVReg2 = MRI->createVirtualRegister(TRC);10912BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)10913.addReg(NewVReg1, RegState::Kill)10914.addImm(PCLabelId)10915.add(predOps(ARMCC::AL));10916BuildMI(*MBB, MI, dl, 
TII->get(ARM::STRi12))10917.addReg(NewVReg2, RegState::Kill)10918.addFrameIndex(FI)10919.addImm(36) // &jbuf[1] :: pc10920.addMemOperand(FIMMOSt)10921.add(predOps(ARMCC::AL));10922}10923}1092410925void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,10926MachineBasicBlock *MBB) const {10927const TargetInstrInfo *TII = Subtarget->getInstrInfo();10928DebugLoc dl = MI.getDebugLoc();10929MachineFunction *MF = MBB->getParent();10930MachineRegisterInfo *MRI = &MF->getRegInfo();10931MachineFrameInfo &MFI = MF->getFrameInfo();10932int FI = MFI.getFunctionContextIndex();1093310934const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass10935: &ARM::GPRnopcRegClass;1093610937// Get a mapping of the call site numbers to all of the landing pads they're10938// associated with.10939DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;10940unsigned MaxCSNum = 0;10941for (MachineBasicBlock &BB : *MF) {10942if (!BB.isEHPad())10943continue;1094410945// FIXME: We should assert that the EH_LABEL is the first MI in the landing10946// pad.10947for (MachineInstr &II : BB) {10948if (!II.isEHLabel())10949continue;1095010951MCSymbol *Sym = II.getOperand(0).getMCSymbol();10952if (!MF->hasCallSiteLandingPad(Sym)) continue;1095310954SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);10955for (unsigned Idx : CallSiteIdxs) {10956CallSiteNumToLPad[Idx].push_back(&BB);10957MaxCSNum = std::max(MaxCSNum, Idx);10958}10959break;10960}10961}1096210963// Get an ordered list of the machine basic blocks for the jump table.10964std::vector<MachineBasicBlock*> LPadList;10965SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;10966LPadList.reserve(CallSiteNumToLPad.size());10967for (unsigned I = 1; I <= MaxCSNum; ++I) {10968SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];10969for (MachineBasicBlock *MBB : MBBList) {10970LPadList.push_back(MBB);10971InvokeBBs.insert(MBB->pred_begin(), MBB->pred_end());10972}10973}1097410975assert(!LPadList.empty() &&10976"No landing pad destinations for the dispatch jump table!");1097710978// Create the jump table and associated information.10979MachineJumpTableInfo *JTI =10980MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);10981unsigned MJTI = JTI->createJumpTableIndex(LPadList);1098210983// Create the MBBs for the dispatch code.1098410985// Shove the dispatch's address into the return slot in the function context.10986MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();10987DispatchBB->setIsEHPad();1098810989MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();10990unsigned trap_opcode;10991if (Subtarget->isThumb())10992trap_opcode = ARM::tTRAP;10993else10994trap_opcode = Subtarget->useNaClTrap() ? 
ARM::TRAPNaCl : ARM::TRAP;1099510996BuildMI(TrapBB, dl, TII->get(trap_opcode));10997DispatchBB->addSuccessor(TrapBB);1099810999MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();11000DispatchBB->addSuccessor(DispContBB);1100111002// Insert and MBBs.11003MF->insert(MF->end(), DispatchBB);11004MF->insert(MF->end(), DispContBB);11005MF->insert(MF->end(), TrapBB);1100611007// Insert code into the entry block that creates and registers the function11008// context.11009SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);1101011011MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(11012MachinePointerInfo::getFixedStack(*MF, FI),11013MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));1101411015MachineInstrBuilder MIB;11016MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));1101711018const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);11019const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();1102011021// Add a register mask with no preserved registers. This results in all11022// registers being marked as clobbered. This can't work if the dispatch block11023// is in a Thumb1 function and is linked with ARM code which uses the FP11024// registers, as there is no way to preserve the FP registers in Thumb1 mode.11025MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));1102611027bool IsPositionIndependent = isPositionIndependent();11028unsigned NumLPads = LPadList.size();11029if (Subtarget->isThumb2()) {11030Register NewVReg1 = MRI->createVirtualRegister(TRC);11031BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)11032.addFrameIndex(FI)11033.addImm(4)11034.addMemOperand(FIMMOLd)11035.add(predOps(ARMCC::AL));1103611037if (NumLPads < 256) {11038BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))11039.addReg(NewVReg1)11040.addImm(LPadList.size())11041.add(predOps(ARMCC::AL));11042} else {11043Register VReg1 = MRI->createVirtualRegister(TRC);11044BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)11045.addImm(NumLPads & 0xFFFF)11046.add(predOps(ARMCC::AL));1104711048unsigned VReg2 = VReg1;11049if ((NumLPads & 0xFFFF0000) != 0) {11050VReg2 = MRI->createVirtualRegister(TRC);11051BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)11052.addReg(VReg1)11053.addImm(NumLPads >> 16)11054.add(predOps(ARMCC::AL));11055}1105611057BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))11058.addReg(NewVReg1)11059.addReg(VReg2)11060.add(predOps(ARMCC::AL));11061}1106211063BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))11064.addMBB(TrapBB)11065.addImm(ARMCC::HI)11066.addReg(ARM::CPSR);1106711068Register NewVReg3 = MRI->createVirtualRegister(TRC);11069BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)11070.addJumpTableIndex(MJTI)11071.add(predOps(ARMCC::AL));1107211073Register NewVReg4 = MRI->createVirtualRegister(TRC);11074BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)11075.addReg(NewVReg3, RegState::Kill)11076.addReg(NewVReg1)11077.addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))11078.add(predOps(ARMCC::AL))11079.add(condCodeOp());1108011081BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))11082.addReg(NewVReg4, RegState::Kill)11083.addReg(NewVReg1)11084.addJumpTableIndex(MJTI);11085} else if (Subtarget->isThumb()) {11086Register NewVReg1 = MRI->createVirtualRegister(TRC);11087BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)11088.addFrameIndex(FI)11089.addImm(1)11090.addMemOperand(FIMMOLd)11091.add(predOps(ARMCC::AL));1109211093if (NumLPads < 256) {11094BuildMI(DispatchBB, dl, 
TII->get(ARM::tCMPi8))11095.addReg(NewVReg1)11096.addImm(NumLPads)11097.add(predOps(ARMCC::AL));11098} else {11099MachineConstantPool *ConstantPool = MF->getConstantPool();11100Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());11101const Constant *C = ConstantInt::get(Int32Ty, NumLPads);1110211103// MachineConstantPool wants an explicit alignment.11104Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);11105unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);1110611107Register VReg1 = MRI->createVirtualRegister(TRC);11108BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))11109.addReg(VReg1, RegState::Define)11110.addConstantPoolIndex(Idx)11111.add(predOps(ARMCC::AL));11112BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))11113.addReg(NewVReg1)11114.addReg(VReg1)11115.add(predOps(ARMCC::AL));11116}1111711118BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))11119.addMBB(TrapBB)11120.addImm(ARMCC::HI)11121.addReg(ARM::CPSR);1112211123Register NewVReg2 = MRI->createVirtualRegister(TRC);11124BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)11125.addReg(ARM::CPSR, RegState::Define)11126.addReg(NewVReg1)11127.addImm(2)11128.add(predOps(ARMCC::AL));1112911130Register NewVReg3 = MRI->createVirtualRegister(TRC);11131BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)11132.addJumpTableIndex(MJTI)11133.add(predOps(ARMCC::AL));1113411135Register NewVReg4 = MRI->createVirtualRegister(TRC);11136BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)11137.addReg(ARM::CPSR, RegState::Define)11138.addReg(NewVReg2, RegState::Kill)11139.addReg(NewVReg3)11140.add(predOps(ARMCC::AL));1114111142MachineMemOperand *JTMMOLd =11143MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),11144MachineMemOperand::MOLoad, 4, Align(4));1114511146Register NewVReg5 = MRI->createVirtualRegister(TRC);11147BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)11148.addReg(NewVReg4, RegState::Kill)11149.addImm(0)11150.addMemOperand(JTMMOLd)11151.add(predOps(ARMCC::AL));1115211153unsigned NewVReg6 = NewVReg5;11154if (IsPositionIndependent) {11155NewVReg6 = MRI->createVirtualRegister(TRC);11156BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)11157.addReg(ARM::CPSR, RegState::Define)11158.addReg(NewVReg5, RegState::Kill)11159.addReg(NewVReg3)11160.add(predOps(ARMCC::AL));11161}1116211163BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))11164.addReg(NewVReg6, RegState::Kill)11165.addJumpTableIndex(MJTI);11166} else {11167Register NewVReg1 = MRI->createVirtualRegister(TRC);11168BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)11169.addFrameIndex(FI)11170.addImm(4)11171.addMemOperand(FIMMOLd)11172.add(predOps(ARMCC::AL));1117311174if (NumLPads < 256) {11175BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))11176.addReg(NewVReg1)11177.addImm(NumLPads)11178.add(predOps(ARMCC::AL));11179} else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {11180Register VReg1 = MRI->createVirtualRegister(TRC);11181BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)11182.addImm(NumLPads & 0xFFFF)11183.add(predOps(ARMCC::AL));1118411185unsigned VReg2 = VReg1;11186if ((NumLPads & 0xFFFF0000) != 0) {11187VReg2 = MRI->createVirtualRegister(TRC);11188BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)11189.addReg(VReg1)11190.addImm(NumLPads >> 16)11191.add(predOps(ARMCC::AL));11192}1119311194BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))11195.addReg(NewVReg1)11196.addReg(VReg2)11197.add(predOps(ARMCC::AL));11198} else {11199MachineConstantPool *ConstantPool = MF->getConstantPool();11200Type 
*Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());11201const Constant *C = ConstantInt::get(Int32Ty, NumLPads);1120211203// MachineConstantPool wants an explicit alignment.11204Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);11205unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);1120611207Register VReg1 = MRI->createVirtualRegister(TRC);11208BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))11209.addReg(VReg1, RegState::Define)11210.addConstantPoolIndex(Idx)11211.addImm(0)11212.add(predOps(ARMCC::AL));11213BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))11214.addReg(NewVReg1)11215.addReg(VReg1, RegState::Kill)11216.add(predOps(ARMCC::AL));11217}1121811219BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))11220.addMBB(TrapBB)11221.addImm(ARMCC::HI)11222.addReg(ARM::CPSR);1122311224Register NewVReg3 = MRI->createVirtualRegister(TRC);11225BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)11226.addReg(NewVReg1)11227.addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))11228.add(predOps(ARMCC::AL))11229.add(condCodeOp());11230Register NewVReg4 = MRI->createVirtualRegister(TRC);11231BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)11232.addJumpTableIndex(MJTI)11233.add(predOps(ARMCC::AL));1123411235MachineMemOperand *JTMMOLd =11236MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),11237MachineMemOperand::MOLoad, 4, Align(4));11238Register NewVReg5 = MRI->createVirtualRegister(TRC);11239BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)11240.addReg(NewVReg3, RegState::Kill)11241.addReg(NewVReg4)11242.addImm(0)11243.addMemOperand(JTMMOLd)11244.add(predOps(ARMCC::AL));1124511246if (IsPositionIndependent) {11247BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))11248.addReg(NewVReg5, RegState::Kill)11249.addReg(NewVReg4)11250.addJumpTableIndex(MJTI);11251} else {11252BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))11253.addReg(NewVReg5, RegState::Kill)11254.addJumpTableIndex(MJTI);11255}11256}1125711258// Add the jump table entries as successors to the MBB.11259SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;11260for (MachineBasicBlock *CurMBB : LPadList) {11261if (SeenMBBs.insert(CurMBB).second)11262DispContBB->addSuccessor(CurMBB);11263}1126411265// N.B. the order the invoke BBs are processed in doesn't matter here.11266const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);11267SmallVector<MachineBasicBlock*, 64> MBBLPads;11268for (MachineBasicBlock *BB : InvokeBBs) {1126911270// Remove the landing pad successor from the invoke block and replace it11271// with the new dispatch block.11272SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());11273while (!Successors.empty()) {11274MachineBasicBlock *SMBB = Successors.pop_back_val();11275if (SMBB->isEHPad()) {11276BB->removeSuccessor(SMBB);11277MBBLPads.push_back(SMBB);11278}11279}1128011281BB->addSuccessor(DispatchBB, BranchProbability::getZero());11282BB->normalizeSuccProbs();1128311284// Find the invoke call and mark all of the callee-saved registers as11285// 'implicit defined' so that they're spilled. 
This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (MachineBasicBlock::reverse_iterator
           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
      if (!II->isCall()) continue;

      DenseMap<unsigned, bool> DefRegs;
      for (MachineInstr::mop_iterator
             OI = II->operands_begin(), OE = II->operands_end();
           OI != OE; ++OI) {
        if (!OI->isReg()) continue;
        DefRegs[OI->getReg()] = true;
      }

      MachineInstrBuilder MIB(*MF, &*II);

      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
        unsigned Reg = SavedRegs[i];
        if (Subtarget->isThumb2() &&
            !ARM::tGPRRegClass.contains(Reg) &&
            !ARM::hGPRRegClass.contains(Reg))
          continue;
        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
          continue;
        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
          continue;
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (MachineBasicBlock *MBBLPad : MBBLPads)
    MBBLPad->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();
}

static
MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
  for (MachineBasicBlock *S : MBB->successors())
    if (S != Succ)
      return S;
  llvm_unreachable("Expecting a BB with two successors!");
}

/// Return the load opcode for a given load size. If load size >= 8,
/// neon opcode will be returned.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
  if (LdSize >= 8)
    return LdSize == 16 ? ARM::VLD1q32wb_fixed
                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
  if (IsThumb1)
    return LdSize == 4 ? ARM::tLDRi
                       : LdSize == 2 ? ARM::tLDRHi
                                     : LdSize == 1 ? ARM::tLDRBi : 0;
  if (IsThumb2)
    return LdSize == 4 ? ARM::t2LDR_POST
                       : LdSize == 2 ? ARM::t2LDRH_POST
                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
  return LdSize == 4 ? ARM::LDR_POST_IMM
                     : LdSize == 2 ? ARM::LDRH_POST
                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
}

/// Return the store opcode for a given store size. If store size >= 8,
/// neon opcode will be returned.
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
  if (StSize >= 8)
    return StSize == 16 ? ARM::VST1q32wb_fixed
                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
  if (IsThumb1)
    return StSize == 4 ? ARM::tSTRi
                       : StSize == 2 ? ARM::tSTRHi
                                     : StSize == 1 ? ARM::tSTRBi : 0;
  if (IsThumb2)
    return StSize == 4 ? ARM::t2STR_POST
                       : StSize == 2 ? ARM::t2STRH_POST
                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
  return StSize == 4 ? ARM::STR_POST_IMM
                     : StSize == 2 ? ARM::STRH_POST
                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
}
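
// For example, with the selections above a 16-byte unit on any target maps to
// the NEON pair VLD1q32wb_fixed/VST1q32wb_fixed, a 4-byte unit on a Thumb2
// target maps to t2LDR_POST/t2STR_POST, and a 1-byte tail copy in ARM mode
// maps to LDRB_POST_IMM/STRB_POST_IMM; unsupported sizes return 0 and trip
// the asserts in the emitters below.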

/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // load + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
        .add(predOps(ARMCC::AL));
  }
}

/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb1) {
    // store + update AddrIn
    BuildMI(*BB, Pos, dl, TII->get(StOpc))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
        .add(predOps(ARMCC::AL));
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else if (IsThumb2) {
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  } else { // arm
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(StSize)
        .add(predOps(ARMCC::AL));
  }
}

MachineBasicBlock *
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Alignment = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass 
*VecTRC = nullptr;1147911480bool IsThumb1 = Subtarget->isThumb1Only();11481bool IsThumb2 = Subtarget->isThumb2();11482bool IsThumb = Subtarget->isThumb();1148311484if (Alignment & 1) {11485UnitSize = 1;11486} else if (Alignment & 2) {11487UnitSize = 2;11488} else {11489// Check whether we can use NEON instructions.11490if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&11491Subtarget->hasNEON()) {11492if ((Alignment % 16 == 0) && SizeVal >= 16)11493UnitSize = 16;11494else if ((Alignment % 8 == 0) && SizeVal >= 8)11495UnitSize = 8;11496}11497// Can't use NEON instructions.11498if (UnitSize == 0)11499UnitSize = 4;11500}1150111502// Select the correct opcode and register class for unit size load/store11503bool IsNeon = UnitSize >= 8;11504TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;11505if (IsNeon)11506VecTRC = UnitSize == 16 ? &ARM::DPairRegClass11507: UnitSize == 8 ? &ARM::DPRRegClass11508: nullptr;1150911510unsigned BytesLeft = SizeVal % UnitSize;11511unsigned LoopSize = SizeVal - BytesLeft;1151211513if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {11514// Use LDR and STR to copy.11515// [scratch, srcOut] = LDR_POST(srcIn, UnitSize)11516// [destOut] = STR_POST(scratch, destIn, UnitSize)11517unsigned srcIn = src;11518unsigned destIn = dest;11519for (unsigned i = 0; i < LoopSize; i+=UnitSize) {11520Register srcOut = MRI.createVirtualRegister(TRC);11521Register destOut = MRI.createVirtualRegister(TRC);11522Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);11523emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,11524IsThumb1, IsThumb2);11525emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,11526IsThumb1, IsThumb2);11527srcIn = srcOut;11528destIn = destOut;11529}1153011531// Handle the leftover bytes with LDRB and STRB.11532// [scratch, srcOut] = LDRB_POST(srcIn, 1)11533// [destOut] = STRB_POST(scratch, destIn, 1)11534for (unsigned i = 0; i < BytesLeft; i++) {11535Register srcOut = MRI.createVirtualRegister(TRC);11536Register destOut = MRI.createVirtualRegister(TRC);11537Register scratch = MRI.createVirtualRegister(TRC);11538emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,11539IsThumb1, IsThumb2);11540emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,11541IsThumb1, IsThumb2);11542srcIn = srcOut;11543destIn = destOut;11544}11545MI.eraseFromParent(); // The instruction is gone now.11546return BB;11547}1154811549// Expand the pseudo op to a loop.11550// thisMBB:11551// ...11552// movw varEnd, # --> with thumb211553// movt varEnd, #11554// ldrcp varEnd, idx --> without thumb211555// fallthrough --> loopMBB11556// loopMBB:11557// PHI varPhi, varEnd, varLoop11558// PHI srcPhi, src, srcLoop11559// PHI destPhi, dst, destLoop11560// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)11561// [destLoop] = STR_POST(scratch, destPhi, UnitSize)11562// subs varLoop, varPhi, #UnitSize11563// bne loopMBB11564// fallthrough --> exitMBB11565// exitMBB:11566// epilogue to handle left-over bytes11567// [scratch, srcOut] = LDRB_POST(srcLoop, 1)11568// [destOut] = STRB_POST(scratch, destLoop, 1)11569MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);11570MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);11571MF->insert(It, loopMBB);11572MF->insert(It, exitMBB);1157311574// Set the call frame size on entry to the new basic blocks.11575unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);11576loopMBB->setCallFrameSize(CallFrameSize);11577exitMBB->setCallFrameSize(CallFrameSize);1157811579// Transfer the 
remainder of BB and its successor edges to exitMBB.11580exitMBB->splice(exitMBB->begin(), BB,11581std::next(MachineBasicBlock::iterator(MI)), BB->end());11582exitMBB->transferSuccessorsAndUpdatePHIs(BB);1158311584// Load an immediate to varEnd.11585Register varEnd = MRI.createVirtualRegister(TRC);11586if (Subtarget->useMovt()) {11587BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),11588varEnd)11589.addImm(LoopSize);11590} else if (Subtarget->genExecuteOnly()) {11591assert(IsThumb && "Non-thumb expected to have used movt");11592BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);11593} else {11594MachineConstantPool *ConstantPool = MF->getConstantPool();11595Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());11596const Constant *C = ConstantInt::get(Int32Ty, LoopSize);1159711598// MachineConstantPool wants an explicit alignment.11599Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);11600unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);11601MachineMemOperand *CPMMO =11602MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),11603MachineMemOperand::MOLoad, 4, Align(4));1160411605if (IsThumb)11606BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))11607.addReg(varEnd, RegState::Define)11608.addConstantPoolIndex(Idx)11609.add(predOps(ARMCC::AL))11610.addMemOperand(CPMMO);11611else11612BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))11613.addReg(varEnd, RegState::Define)11614.addConstantPoolIndex(Idx)11615.addImm(0)11616.add(predOps(ARMCC::AL))11617.addMemOperand(CPMMO);11618}11619BB->addSuccessor(loopMBB);1162011621// Generate the loop body:11622// varPhi = PHI(varLoop, varEnd)11623// srcPhi = PHI(srcLoop, src)11624// destPhi = PHI(destLoop, dst)11625MachineBasicBlock *entryBB = BB;11626BB = loopMBB;11627Register varLoop = MRI.createVirtualRegister(TRC);11628Register varPhi = MRI.createVirtualRegister(TRC);11629Register srcLoop = MRI.createVirtualRegister(TRC);11630Register srcPhi = MRI.createVirtualRegister(TRC);11631Register destLoop = MRI.createVirtualRegister(TRC);11632Register destPhi = MRI.createVirtualRegister(TRC);1163311634BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)11635.addReg(varLoop).addMBB(loopMBB)11636.addReg(varEnd).addMBB(entryBB);11637BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)11638.addReg(srcLoop).addMBB(loopMBB)11639.addReg(src).addMBB(entryBB);11640BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)11641.addReg(destLoop).addMBB(loopMBB)11642.addReg(dest).addMBB(entryBB);1164311644// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)11645// [destLoop] = STR_POST(scratch, destPhi, UnitSiz)11646Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);11647emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,11648IsThumb1, IsThumb2);11649emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,11650IsThumb1, IsThumb2);1165111652// Decrement loop variable by UnitSize.11653if (IsThumb1) {11654BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)11655.add(t1CondCodeOp())11656.addReg(varPhi)11657.addImm(UnitSize)11658.add(predOps(ARMCC::AL));11659} else {11660MachineInstrBuilder MIB =11661BuildMI(*BB, BB->end(), dl,11662TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);11663MIB.addReg(varPhi)11664.addImm(UnitSize)11665.add(predOps(ARMCC::AL))11666.add(condCodeOp());11667MIB->getOperand(5).setReg(ARM::CPSR);11668MIB->getOperand(5).setIsDef(true);11669}11670BuildMI(*BB, BB->end(), dl,11671TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? 
ARM::t2Bcc : ARM::Bcc))11672.addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);1167311674// loopMBB can loop back to loopMBB or fall through to exitMBB.11675BB->addSuccessor(loopMBB);11676BB->addSuccessor(exitMBB);1167711678// Add epilogue to handle BytesLeft.11679BB = exitMBB;11680auto StartOfExit = exitMBB->begin();1168111682// [scratch, srcOut] = LDRB_POST(srcLoop, 1)11683// [destOut] = STRB_POST(scratch, destLoop, 1)11684unsigned srcIn = srcLoop;11685unsigned destIn = destLoop;11686for (unsigned i = 0; i < BytesLeft; i++) {11687Register srcOut = MRI.createVirtualRegister(TRC);11688Register destOut = MRI.createVirtualRegister(TRC);11689Register scratch = MRI.createVirtualRegister(TRC);11690emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,11691IsThumb1, IsThumb2);11692emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,11693IsThumb1, IsThumb2);11694srcIn = srcOut;11695destIn = destOut;11696}1169711698MI.eraseFromParent(); // The instruction is gone now.11699return BB;11700}1170111702MachineBasicBlock *11703ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,11704MachineBasicBlock *MBB) const {11705const TargetMachine &TM = getTargetMachine();11706const TargetInstrInfo &TII = *Subtarget->getInstrInfo();11707DebugLoc DL = MI.getDebugLoc();1170811709assert(Subtarget->isTargetWindows() &&11710"__chkstk is only supported on Windows");11711assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");1171211713// __chkstk takes the number of words to allocate on the stack in R4, and11714// returns the stack adjustment in number of bytes in R4. This will not11715// clober any other registers (other than the obvious lr).11716//11717// Although, technically, IP should be considered a register which may be11718// clobbered, the call itself will not touch it. Windows on ARM is a pure11719// thumb-2 environment, so there is no interworking required. 
As a result, we11720// do not expect a veneer to be emitted by the linker, clobbering IP.11721//11722// Each module receives its own copy of __chkstk, so no import thunk is11723// required, again, ensuring that IP is not clobbered.11724//11725// Finally, although some linkers may theoretically provide a trampoline for11726// out of range calls (which is quite common due to a 32M range limitation of11727// branches for Thumb), we can generate the long-call version via11728// -mcmodel=large, alleviating the need for the trampoline which may clobber11729// IP.1173011731switch (TM.getCodeModel()) {11732case CodeModel::Tiny:11733llvm_unreachable("Tiny code model not available on ARM.");11734case CodeModel::Small:11735case CodeModel::Medium:11736case CodeModel::Kernel:11737BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))11738.add(predOps(ARMCC::AL))11739.addExternalSymbol("__chkstk")11740.addReg(ARM::R4, RegState::Implicit | RegState::Kill)11741.addReg(ARM::R4, RegState::Implicit | RegState::Define)11742.addReg(ARM::R12,11743RegState::Implicit | RegState::Define | RegState::Dead)11744.addReg(ARM::CPSR,11745RegState::Implicit | RegState::Define | RegState::Dead);11746break;11747case CodeModel::Large: {11748MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();11749Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);1175011751BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)11752.addExternalSymbol("__chkstk");11753BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))11754.add(predOps(ARMCC::AL))11755.addReg(Reg, RegState::Kill)11756.addReg(ARM::R4, RegState::Implicit | RegState::Kill)11757.addReg(ARM::R4, RegState::Implicit | RegState::Define)11758.addReg(ARM::R12,11759RegState::Implicit | RegState::Define | RegState::Dead)11760.addReg(ARM::CPSR,11761RegState::Implicit | RegState::Define | RegState::Dead);11762break;11763}11764}1176511766BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)11767.addReg(ARM::SP, RegState::Kill)11768.addReg(ARM::R4, RegState::Kill)11769.setMIFlags(MachineInstr::FrameSetup)11770.add(predOps(ARMCC::AL))11771.add(condCodeOp());1177211773MI.eraseFromParent();11774return MBB;11775}1177611777MachineBasicBlock *11778ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,11779MachineBasicBlock *MBB) const {11780DebugLoc DL = MI.getDebugLoc();11781MachineFunction *MF = MBB->getParent();11782const TargetInstrInfo *TII = Subtarget->getInstrInfo();1178311784MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();11785MF->insert(++MBB->getIterator(), ContBB);11786ContBB->splice(ContBB->begin(), MBB,11787std::next(MachineBasicBlock::iterator(MI)), MBB->end());11788ContBB->transferSuccessorsAndUpdatePHIs(MBB);11789MBB->addSuccessor(ContBB);1179011791MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();11792BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));11793MF->push_back(TrapBB);11794MBB->addSuccessor(TrapBB);1179511796BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))11797.addReg(MI.getOperand(0).getReg())11798.addImm(0)11799.add(predOps(ARMCC::AL));11800BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))11801.addMBB(TrapBB)11802.addImm(ARMCC::EQ)11803.addReg(ARM::CPSR);1180411805MI.eraseFromParent();11806return ContBB;11807}1180811809// The CPSR operand of SelectItr might be missing a kill marker11810// because there were multiple uses of CPSR, and ISel didn't know11811// which to mark. Figure out whether SelectItr should have had a11812// kill marker, and set it if it should. 
Returns the correct kill
// marker value.
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
                                   MachineBasicBlock* BB,
                                   const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of CPSR.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
      return false;
    if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether CPSR is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock *Succ : BB->successors())
      if (Succ->isLiveIn(ARM::CPSR))
        return false;
  }

  // We found a def, or hit the end of the basic block and CPSR wasn't live
  // out. SelectMI should have a kill flag on CPSR.
  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  return true;
}

/// Adds logic in loop entry MBB to calculate loop iteration count and adds
/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
static Register genTPEntry(MachineBasicBlock *TpEntry,
                           MachineBasicBlock *TpLoopBody,
                           MachineBasicBlock *TpExit, Register OpSizeReg,
                           const TargetInstrInfo *TII, DebugLoc Dl,
                           MachineRegisterInfo &MRI) {
  // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
      .addUse(OpSizeReg)
      .addImm(15)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
      .addUse(AddDestReg, RegState::Kill)
      .addImm(4)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
      .addUse(LsrDestReg, RegState::Kill);

  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
      .addUse(TotalIterationsReg)
      .addMBB(TpExit);

  BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
      .addMBB(TpLoopBody)
      .add(predOps(ARMCC::AL));

  return TotalIterationsReg;
}
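
// Worked example for the count computed above: a 100-byte memcpy gives
// (100 + 15) >> 4 == 7 iterations, i.e. ceil(100/16); the loop body built
// below handles the final, partial 16-byte block via VCTP predication.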

/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
/// t2DoLoopEnd. These are used by later passes to generate tail predicated
/// loops.
static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
                          const TargetInstrInfo *TII, DebugLoc Dl,
                          MachineRegisterInfo &MRI, Register OpSrcReg,
                          Register OpDestReg, Register ElementCountReg,
                          Register TotalIterationsReg, bool IsMemcpy) {
  // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
  // array, loop iteration counter, predication counter.

  Register SrcPhiReg, CurrSrcReg;
  if (IsMemcpy) {
    // Current position in the src array
    SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
        .addUse(OpSrcReg)
        .addMBB(TpEntry)
        .addUse(CurrSrcReg)
        .addMBB(TpLoopBody);
  }

  // Current position in the dest array
  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
      .addUse(OpDestReg)
      .addMBB(TpEntry)
      .addUse(CurrDestReg)
      .addMBB(TpLoopBody);

  // Current loop counter
  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  Register RemainingLoopIterationsReg =
      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
      .addUse(TotalIterationsReg)
      .addMBB(TpEntry)
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Predication counter
  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
      .addUse(ElementCountReg)
      .addMBB(TpEntry)
      .addUse(RemainingElementsReg)
      .addMBB(TpLoopBody);

  // Pass predication counter to VCTP
  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
      .addUse(PredCounterPhiReg)
      .addImm(ARMVCC::None)
      .addReg(0)
      .addReg(0);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
      .addUse(PredCounterPhiReg)
      .addImm(16)
      .add(predOps(ARMCC::AL))
      .addReg(0);

  // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
  Register SrcValueReg;
  if (IsMemcpy) {
    SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
    BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
        .addDef(CurrSrcReg)
        .addDef(SrcValueReg)
        .addReg(SrcPhiReg)
        .addImm(16)
        .addImm(ARMVCC::Then)
        .addUse(VccrReg)
        .addReg(0);
  } else
    SrcValueReg = OpSrcReg;

  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
      .addDef(CurrDestReg)
      .addUse(SrcValueReg)
      .addReg(DestPhiReg)
      .addImm(16)
      .addImm(ARMVCC::Then)
      .addUse(VccrReg)
      .addReg(0);

  // Add the pseudoInstrs for decrementing the loop counter and marking the
  // end:t2DoLoopDec and t2DoLoopEnd
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
      .addUse(LoopCounterPhiReg)
      .addImm(1);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  BuildMI(TpLoopBody, Dl, 
TII->get(ARM::t2B))11979.addMBB(TpExit)11980.add(predOps(ARMCC::AL));11981}1198211983MachineBasicBlock *11984ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,11985MachineBasicBlock *BB) const {11986const TargetInstrInfo *TII = Subtarget->getInstrInfo();11987DebugLoc dl = MI.getDebugLoc();11988bool isThumb2 = Subtarget->isThumb2();11989switch (MI.getOpcode()) {11990default: {11991MI.print(errs());11992llvm_unreachable("Unexpected instr type to insert");11993}1199411995// Thumb1 post-indexed loads are really just single-register LDMs.11996case ARM::tLDR_postidx: {11997MachineOperand Def(MI.getOperand(1));11998BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))11999.add(Def) // Rn_wb12000.add(MI.getOperand(2)) // Rn12001.add(MI.getOperand(3)) // PredImm12002.add(MI.getOperand(4)) // PredReg12003.add(MI.getOperand(0)) // Rt12004.cloneMemRefs(MI);12005MI.eraseFromParent();12006return BB;12007}1200812009case ARM::MVE_MEMCPYLOOPINST:12010case ARM::MVE_MEMSETLOOPINST: {1201112012// Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo12013// into a Tail Predicated (TP) Loop. It adds the instructions to calculate12014// the iteration count =ceil(size_in_bytes/16)) in the TP entry block and12015// adds the relevant instructions in the TP loop Body for generation of a12016// WLSTP loop.1201712018// Below is relevant portion of the CFG after the transformation.12019// The Machine Basic Blocks are shown along with branch conditions (in12020// brackets). Note that TP entry/exit MBBs depict the entry/exit of this12021// portion of the CFG and may not necessarily be the entry/exit of the12022// function.1202312024// (Relevant) CFG after transformation:12025// TP entry MBB12026// |12027// |-----------------|12028// (n <= 0) (n > 0)12029// | |12030// | TP loop Body MBB<--|12031// | | |12032// \ |___________|12033// \ /12034// TP exit MBB1203512036MachineFunction *MF = BB->getParent();12037MachineFunctionProperties &Properties = MF->getProperties();12038MachineRegisterInfo &MRI = MF->getRegInfo();1203912040Register OpDestReg = MI.getOperand(0).getReg();12041Register OpSrcReg = MI.getOperand(1).getReg();12042Register OpSizeReg = MI.getOperand(2).getReg();1204312044// Allocate the required MBBs and add to parent function.12045MachineBasicBlock *TpEntry = BB;12046MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();12047MachineBasicBlock *TpExit;1204812049MF->push_back(TpLoopBody);1205012051// If any instructions are present in the current block after12052// MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and12053// move the instructions into the newly created exit block. If there are no12054// instructions add an explicit branch to the FallThrough block and then12055// split.12056//12057// The split is required for two reasons:12058// 1) A terminator(t2WhileLoopStart) will be placed at that site.12059// 2) Since a TPLoopBody will be added later, any phis in successive blocks12060// need to be updated. 
splitAt() already handles this.12061TpExit = BB->splitAt(MI, false);12062if (TpExit == BB) {12063assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "12064"block containing memcpy/memset Pseudo");12065TpExit = BB->getFallThrough();12066BuildMI(BB, dl, TII->get(ARM::t2B))12067.addMBB(TpExit)12068.add(predOps(ARMCC::AL));12069TpExit = BB->splitAt(MI, false);12070}1207112072// Add logic for iteration count12073Register TotalIterationsReg =12074genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);1207512076// Add the vectorized (and predicated) loads/store instructions12077bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;12078genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,12079OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);1208012081// Required to avoid conflict with the MachineVerifier during testing.12082Properties.reset(MachineFunctionProperties::Property::NoPHIs);1208312084// Connect the blocks12085TpEntry->addSuccessor(TpLoopBody);12086TpLoopBody->addSuccessor(TpLoopBody);12087TpLoopBody->addSuccessor(TpExit);1208812089// Reorder for a more natural layout12090TpLoopBody->moveAfter(TpEntry);12091TpExit->moveAfter(TpLoopBody);1209212093// Finally, remove the memcpy Pseudo Instruction12094MI.eraseFromParent();1209512096// Return the exit block as it may contain other instructions requiring a12097// custom inserter12098return TpExit;12099}1210012101// The Thumb2 pre-indexed stores have the same MI operands, they just12102// define them differently in the .td files from the isel patterns, so12103// they need pseudos.12104case ARM::t2STR_preidx:12105MI.setDesc(TII->get(ARM::t2STR_PRE));12106return BB;12107case ARM::t2STRB_preidx:12108MI.setDesc(TII->get(ARM::t2STRB_PRE));12109return BB;12110case ARM::t2STRH_preidx:12111MI.setDesc(TII->get(ARM::t2STRH_PRE));12112return BB;1211312114case ARM::STRi_preidx:12115case ARM::STRBi_preidx: {12116unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM12117: ARM::STRB_PRE_IMM;12118// Decode the offset.12119unsigned Offset = MI.getOperand(4).getImm();12120bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;12121Offset = ARM_AM::getAM2Offset(Offset);12122if (isSub)12123Offset = -Offset;1212412125MachineMemOperand *MMO = *MI.memoperands_begin();12126BuildMI(*BB, MI, dl, TII->get(NewOpc))12127.add(MI.getOperand(0)) // Rn_wb12128.add(MI.getOperand(1)) // Rt12129.add(MI.getOperand(2)) // Rn12130.addImm(Offset) // offset (skip GPR==zero_reg)12131.add(MI.getOperand(5)) // pred12132.add(MI.getOperand(6))12133.addMemOperand(MMO);12134MI.eraseFromParent();12135return BB;12136}12137case ARM::STRr_preidx:12138case ARM::STRBr_preidx:12139case ARM::STRH_preidx: {12140unsigned NewOpc;12141switch (MI.getOpcode()) {12142default: llvm_unreachable("unexpected opcode!");12143case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;12144case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;12145case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;12146}12147MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));12148for (const MachineOperand &MO : MI.operands())12149MIB.add(MO);12150MI.eraseFromParent();12151return BB;12152}1215312154case ARM::tMOVCCr_pseudo: {12155// To "insert" a SELECT_CC instruction, we actually have to insert the12156// diamond control-flow pattern. 
The incoming instruction knows the12157// destination vreg to set, the condition code register to branch on, the12158// true/false values to select between, and a branch opcode to use.12159const BasicBlock *LLVM_BB = BB->getBasicBlock();12160MachineFunction::iterator It = ++BB->getIterator();1216112162// thisMBB:12163// ...12164// TrueVal = ...12165// cmpTY ccX, r1, r212166// bCC copy1MBB12167// fallthrough --> copy0MBB12168MachineBasicBlock *thisMBB = BB;12169MachineFunction *F = BB->getParent();12170MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);12171MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);12172F->insert(It, copy0MBB);12173F->insert(It, sinkMBB);1217412175// Set the call frame size on entry to the new basic blocks.12176unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);12177copy0MBB->setCallFrameSize(CallFrameSize);12178sinkMBB->setCallFrameSize(CallFrameSize);1217912180// Check whether CPSR is live past the tMOVCCr_pseudo.12181const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();12182if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&12183!checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {12184copy0MBB->addLiveIn(ARM::CPSR);12185sinkMBB->addLiveIn(ARM::CPSR);12186}1218712188// Transfer the remainder of BB and its successor edges to sinkMBB.12189sinkMBB->splice(sinkMBB->begin(), BB,12190std::next(MachineBasicBlock::iterator(MI)), BB->end());12191sinkMBB->transferSuccessorsAndUpdatePHIs(BB);1219212193BB->addSuccessor(copy0MBB);12194BB->addSuccessor(sinkMBB);1219512196BuildMI(BB, dl, TII->get(ARM::tBcc))12197.addMBB(sinkMBB)12198.addImm(MI.getOperand(3).getImm())12199.addReg(MI.getOperand(4).getReg());1220012201// copy0MBB:12202// %FalseValue = ...12203// # fallthrough to sinkMBB12204BB = copy0MBB;1220512206// Update machine-CFG edges12207BB->addSuccessor(sinkMBB);1220812209// sinkMBB:12210// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]12211// ...12212BB = sinkMBB;12213BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())12214.addReg(MI.getOperand(1).getReg())12215.addMBB(copy0MBB)12216.addReg(MI.getOperand(2).getReg())12217.addMBB(thisMBB);1221812219MI.eraseFromParent(); // The pseudo instruction is gone now.12220return BB;12221}1222212223case ARM::BCCi64:12224case ARM::BCCZi64: {12225// If there is an unconditional branch to the other successor, remove it.12226BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());1222712228// Compare both parts that make up the double comparison separately for12229// equality.12230bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;1223112232Register LHS1 = MI.getOperand(1).getReg();12233Register LHS2 = MI.getOperand(2).getReg();12234if (RHSisZero) {12235BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))12236.addReg(LHS1)12237.addImm(0)12238.add(predOps(ARMCC::AL));12239BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))12240.addReg(LHS2).addImm(0)12241.addImm(ARMCC::EQ).addReg(ARM::CPSR);12242} else {12243Register RHS1 = MI.getOperand(3).getReg();12244Register RHS2 = MI.getOperand(4).getReg();12245BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))12246.addReg(LHS1)12247.addReg(RHS1)12248.add(predOps(ARMCC::AL));12249BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))12250.addReg(LHS2).addReg(RHS2)12251.addImm(ARMCC::EQ).addReg(ARM::CPSR);12252}1225312254MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 
3 : 5).getMBB();12255MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);12256if (MI.getOperand(0).getImm() == ARMCC::NE)12257std::swap(destMBB, exitMBB);1225812259BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))12260.addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);12261if (isThumb2)12262BuildMI(BB, dl, TII->get(ARM::t2B))12263.addMBB(exitMBB)12264.add(predOps(ARMCC::AL));12265else12266BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);1226712268MI.eraseFromParent(); // The pseudo instruction is gone now.12269return BB;12270}1227112272case ARM::Int_eh_sjlj_setjmp:12273case ARM::Int_eh_sjlj_setjmp_nofp:12274case ARM::tInt_eh_sjlj_setjmp:12275case ARM::t2Int_eh_sjlj_setjmp:12276case ARM::t2Int_eh_sjlj_setjmp_nofp:12277return BB;1227812279case ARM::Int_eh_sjlj_setup_dispatch:12280EmitSjLjDispatchBlock(MI, BB);12281return BB;1228212283case ARM::ABS:12284case ARM::t2ABS: {12285// To insert an ABS instruction, we have to insert the12286// diamond control-flow pattern. The incoming instruction knows the12287// source vreg to test against 0, the destination vreg to set,12288// the condition code register to branch on, the12289// true/false values to select between, and a branch opcode to use.12290// It transforms12291// V1 = ABS V012292// into12293// V2 = MOVS V012294// BCC (branch to SinkBB if V0 >= 0)12295// RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)12296// SinkBB: V1 = PHI(V2, V3)12297const BasicBlock *LLVM_BB = BB->getBasicBlock();12298MachineFunction::iterator BBI = ++BB->getIterator();12299MachineFunction *Fn = BB->getParent();12300MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);12301MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);12302Fn->insert(BBI, RSBBB);12303Fn->insert(BBI, SinkBB);1230412305Register ABSSrcReg = MI.getOperand(1).getReg();12306Register ABSDstReg = MI.getOperand(0).getReg();12307bool ABSSrcKIll = MI.getOperand(1).isKill();12308bool isThumb2 = Subtarget->isThumb2();12309MachineRegisterInfo &MRI = Fn->getRegInfo();12310// In Thumb mode S must not be specified if source register is the SP or12311// PC and if destination register is the SP, so restrict register class12312Register NewRsbDstReg = MRI.createVirtualRegister(12313isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);1231412315// Transfer the remainder of BB and its successor edges to sinkMBB.12316SinkBB->splice(SinkBB->begin(), BB,12317std::next(MachineBasicBlock::iterator(MI)), BB->end());12318SinkBB->transferSuccessorsAndUpdatePHIs(BB);1231912320BB->addSuccessor(RSBBB);12321BB->addSuccessor(SinkBB);1232212323// fall through to SinkMBB12324RSBBB->addSuccessor(SinkBB);1232512326// insert a cmp at the end of BB12327BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))12328.addReg(ABSSrcReg)12329.addImm(0)12330.add(predOps(ARMCC::AL));1233112332// insert a bcc with opposite CC to ARMCC::MI at the end of BB12333BuildMI(BB, dl,12334TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)12335.addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);1233612337// insert rsbri in RSBBB12338// Note: BCC and rsbri will be converted into predicated rsbmi12339// by if-conversion pass12340BuildMI(*RSBBB, RSBBB->begin(), dl,12341TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)12342.addReg(ABSSrcReg, ABSSrcKIll ? 
RegState::Kill : 0)12343.addImm(0)12344.add(predOps(ARMCC::AL))12345.add(condCodeOp());1234612347// insert PHI in SinkBB,12348// reuse ABSDstReg to not change uses of ABS instruction12349BuildMI(*SinkBB, SinkBB->begin(), dl,12350TII->get(ARM::PHI), ABSDstReg)12351.addReg(NewRsbDstReg).addMBB(RSBBB)12352.addReg(ABSSrcReg).addMBB(BB);1235312354// remove ABS instruction12355MI.eraseFromParent();1235612357// return last added BB12358return SinkBB;12359}12360case ARM::COPY_STRUCT_BYVAL_I32:12361++NumLoopByVals;12362return EmitStructByval(MI, BB);12363case ARM::WIN__CHKSTK:12364return EmitLowered__chkstk(MI, BB);12365case ARM::WIN__DBZCHK:12366return EmitLowered__dbzchk(MI, BB);12367}12368}1236912370/// Attaches vregs to MEMCPY that it will use as scratch registers12371/// when it is expanded into LDM/STM. This is done as a post-isel lowering12372/// instead of as a custom inserter because we need the use list from the SDNode.12373static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,12374MachineInstr &MI, const SDNode *Node) {12375bool isThumb1 = Subtarget->isThumb1Only();1237612377DebugLoc DL = MI.getDebugLoc();12378MachineFunction *MF = MI.getParent()->getParent();12379MachineRegisterInfo &MRI = MF->getRegInfo();12380MachineInstrBuilder MIB(*MF, MI);1238112382// If the new dst/src is unused mark it as dead.12383if (!Node->hasAnyUseOfValue(0)) {12384MI.getOperand(0).setIsDead(true);12385}12386if (!Node->hasAnyUseOfValue(1)) {12387MI.getOperand(1).setIsDead(true);12388}1238912390// The MEMCPY both defines and kills the scratch registers.12391for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {12392Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass12393: &ARM::GPRRegClass);12394MIB.addReg(TmpReg, RegState::Define|RegState::Dead);12395}12396}1239712398void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,12399SDNode *Node) const {12400if (MI.getOpcode() == ARM::MEMCPY) {12401attachMEMCPYScratchRegs(Subtarget, MI, Node);12402return;12403}1240412405const MCInstrDesc *MCID = &MI.getDesc();12406// Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,12407// RSC. Coming out of isel, they have an implicit CPSR def, but the optional12408// operand is still set to noreg. If needed, set the optional operand's12409// register to CPSR, and remove the redundant implicit def.12410//12411// e.g. ADCS (..., implicit-def CPSR) -> ADC (... 
opt:def CPSR).1241212413// Rename pseudo opcodes.12414unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());12415unsigned ccOutIdx;12416if (NewOpc) {12417const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();12418MCID = &TII->get(NewOpc);1241912420assert(MCID->getNumOperands() ==12421MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()12422&& "converted opcode should be the same except for cc_out"12423" (and, on Thumb1, pred)");1242412425MI.setDesc(*MCID);1242612427// Add the optional cc_out operand12428MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));1242912430// On Thumb1, move all input operands to the end, then add the predicate12431if (Subtarget->isThumb1Only()) {12432for (unsigned c = MCID->getNumOperands() - 4; c--;) {12433MI.addOperand(MI.getOperand(1));12434MI.removeOperand(1);12435}1243612437// Restore the ties12438for (unsigned i = MI.getNumOperands(); i--;) {12439const MachineOperand& op = MI.getOperand(i);12440if (op.isReg() && op.isUse()) {12441int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);12442if (DefIdx != -1)12443MI.tieOperands(DefIdx, i);12444}12445}1244612447MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));12448MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));12449ccOutIdx = 1;12450} else12451ccOutIdx = MCID->getNumOperands() - 1;12452} else12453ccOutIdx = MCID->getNumOperands() - 1;1245412455// Any ARM instruction that sets the 's' bit should specify an optional12456// "cc_out" operand in the last operand position.12457if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {12458assert(!NewOpc && "Optional cc_out operand required");12459return;12460}12461// Look for an implicit def of CPSR added by MachineInstr ctor. Remove it12462// since we already have an optional CPSR def.12463bool definesCPSR = false;12464bool deadCPSR = false;12465for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;12466++i) {12467const MachineOperand &MO = MI.getOperand(i);12468if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {12469definesCPSR = true;12470if (MO.isDead())12471deadCPSR = true;12472MI.removeOperand(i);12473break;12474}12475}12476if (!definesCPSR) {12477assert(!NewOpc && "Optional cc_out operand required");12478return;12479}12480assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");12481if (deadCPSR) {12482assert(!MI.getOperand(ccOutIdx).getReg() &&12483"expect uninitialized optional cc_out operand");12484// Thumb1 instructions must have the S bit even if the CPSR is dead.12485if (!Subtarget->isThumb1Only())12486return;12487}1248812489// If this instruction was defined with an optional CPSR def and its dag node12490// had a live implicit CPSR def, then activate the optional CPSR def.12491MachineOperand &MO = MI.getOperand(ccOutIdx);12492MO.setReg(ARM::CPSR);12493MO.setIsDef(true);12494}1249512496//===----------------------------------------------------------------------===//12497// ARM Optimization Hooks12498//===----------------------------------------------------------------------===//1249912500// Helper function that checks if N is a null or all ones constant.12501static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {12502return AllOnes ? 
isAllOnesConstant(N) : isNullConstant(N);12503}1250412505// Return true if N is conditionally 0 or all ones.12506// Detects these expressions where cc is an i1 value:12507//12508// (select cc 0, y) [AllOnes=0]12509// (select cc y, 0) [AllOnes=0]12510// (zext cc) [AllOnes=0]12511// (sext cc) [AllOnes=0/1]12512// (select cc -1, y) [AllOnes=1]12513// (select cc y, -1) [AllOnes=1]12514//12515// Invert is set when N is the null/all ones constant when CC is false.12516// OtherOp is set to the alternative value of N.12517static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,12518SDValue &CC, bool &Invert,12519SDValue &OtherOp,12520SelectionDAG &DAG) {12521switch (N->getOpcode()) {12522default: return false;12523case ISD::SELECT: {12524CC = N->getOperand(0);12525SDValue N1 = N->getOperand(1);12526SDValue N2 = N->getOperand(2);12527if (isZeroOrAllOnes(N1, AllOnes)) {12528Invert = false;12529OtherOp = N2;12530return true;12531}12532if (isZeroOrAllOnes(N2, AllOnes)) {12533Invert = true;12534OtherOp = N1;12535return true;12536}12537return false;12538}12539case ISD::ZERO_EXTEND:12540// (zext cc) can never be the all ones value.12541if (AllOnes)12542return false;12543[[fallthrough]];12544case ISD::SIGN_EXTEND: {12545SDLoc dl(N);12546EVT VT = N->getValueType(0);12547CC = N->getOperand(0);12548if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)12549return false;12550Invert = !AllOnes;12551if (AllOnes)12552// When looking for an AllOnes constant, N is an sext, and the 'other'12553// value is 0.12554OtherOp = DAG.getConstant(0, dl, VT);12555else if (N->getOpcode() == ISD::ZERO_EXTEND)12556// When looking for a 0 constant, N can be zext or sext.12557OtherOp = DAG.getConstant(1, dl, VT);12558else12559OtherOp = DAG.getAllOnesConstant(dl, VT);12560return true;12561}12562}12563}1256412565// Combine a constant select operand into its use:12566//12567// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))12568// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))12569// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]12570// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))12571// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))12572//12573// The transform is rejected if the select doesn't have a constant operand that12574// is null, or all ones when AllOnes is set.12575//12576// Also recognize sext/zext from i1:12577//12578// (add (zext cc), x) -> (select cc (add x, 1), x)12579// (add (sext cc), x) -> (select cc (add x, -1), x)12580//12581// These transformations eventually create predicated instructions.12582//12583// @param N The node to transform.12584// @param Slct The N operand that is a select.12585// @param OtherOp The other N operand (x above).12586// @param DCI Context.12587// @param AllOnes Require the select constant to be all ones instead of null.12588// @returns The new node, or SDValue() on failure.12589static12590SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,12591TargetLowering::DAGCombinerInfo &DCI,12592bool AllOnes = false) {12593SelectionDAG &DAG = DCI.DAG;12594EVT VT = N->getValueType(0);12595SDValue NonConstantVal;12596SDValue CCOp;12597bool SwapSelectOps;12598if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,12599NonConstantVal, DAG))12600return SDValue();1260112602// Slct is now know to be the desired identity constant when CC is true.12603SDValue TrueVal = OtherOp;12604SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,12605OtherOp, NonConstantVal);12606// Unless 
SwapSelectOps says CC should be false.12607if (SwapSelectOps)12608std::swap(TrueVal, FalseVal);1260912610return DAG.getNode(ISD::SELECT, SDLoc(N), VT,12611CCOp, TrueVal, FalseVal);12612}1261312614// Attempt combineSelectAndUse on each operand of a commutative operator N.12615static12616SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,12617TargetLowering::DAGCombinerInfo &DCI) {12618SDValue N0 = N->getOperand(0);12619SDValue N1 = N->getOperand(1);12620if (N0.getNode()->hasOneUse())12621if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))12622return Result;12623if (N1.getNode()->hasOneUse())12624if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))12625return Result;12626return SDValue();12627}1262812629static bool IsVUZPShuffleNode(SDNode *N) {12630// VUZP shuffle node.12631if (N->getOpcode() == ARMISD::VUZP)12632return true;1263312634// "VUZP" on i32 is an alias for VTRN.12635if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)12636return true;1263712638return false;12639}1264012641static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,12642TargetLowering::DAGCombinerInfo &DCI,12643const ARMSubtarget *Subtarget) {12644// Look for ADD(VUZP.0, VUZP.1).12645if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||12646N0 == N1)12647return SDValue();1264812649// Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.12650if (!N->getValueType(0).is64BitVector())12651return SDValue();1265212653// Generate vpadd.12654SelectionDAG &DAG = DCI.DAG;12655const TargetLowering &TLI = DAG.getTargetLoweringInfo();12656SDLoc dl(N);12657SDNode *Unzip = N0.getNode();12658EVT VT = N->getValueType(0);1265912660SmallVector<SDValue, 8> Ops;12661Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,12662TLI.getPointerTy(DAG.getDataLayout())));12663Ops.push_back(Unzip->getOperand(0));12664Ops.push_back(Unzip->getOperand(1));1266512666return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);12667}1266812669static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,12670TargetLowering::DAGCombinerInfo &DCI,12671const ARMSubtarget *Subtarget) {12672// Check for two extended operands.12673if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&12674N1.getOpcode() == ISD::SIGN_EXTEND) &&12675!(N0.getOpcode() == ISD::ZERO_EXTEND &&12676N1.getOpcode() == ISD::ZERO_EXTEND))12677return SDValue();1267812679SDValue N00 = N0.getOperand(0);12680SDValue N10 = N1.getOperand(0);1268112682// Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))12683if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||12684N00 == N10)12685return SDValue();1268612687// We only recognize Q register paddl here; this can't be reached until12688// after type legalization.12689if (!N00.getValueType().is64BitVector() ||12690!N0.getValueType().is128BitVector())12691return SDValue();1269212693// Generate vpaddl.12694SelectionDAG &DAG = DCI.DAG;12695const TargetLowering &TLI = DAG.getTargetLoweringInfo();12696SDLoc dl(N);12697EVT VT = N->getValueType(0);1269812699SmallVector<SDValue, 8> Ops;12700// Form vpaddl.sN or vpaddl.uN depending on the kind of extension.12701unsigned Opcode;12702if (N0.getOpcode() == ISD::SIGN_EXTEND)12703Opcode = Intrinsic::arm_neon_vpaddls;12704else12705Opcode = Intrinsic::arm_neon_vpaddlu;12706Ops.push_back(DAG.getConstant(Opcode, dl,12707TLI.getPointerTy(DAG.getDataLayout())));12708EVT ElemTy = N00.getValueType().getVectorElementType();12709unsigned NumElts = VT.getVectorNumElements();12710EVT ConcatVT = 
      EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
                               N00.getOperand(0), N00.getOperand(1));
  Ops.push_back(Concat);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
}

// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
// much easier to match.
static SDValue
AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const ARMSubtarget *Subtarget) {
  // Only perform optimization if after legalize, and if NEON is available. We
  // also expect both operands to be BUILD_VECTORs.
  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
      || N0.getOpcode() != ISD::BUILD_VECTOR
      || N1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
  EVT VT = N->getValueType(0);
  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
    return SDValue();

  // Check that the vector operands are of the right form.
  // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR_ELT
  // operands, where N is the size of the formed vector.
  // Each EXTRACT_VECTOR_ELT should have the same input vector and an odd or
  // even index, such that we have a pairwise add pattern.

  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();
  SDValue Vec = N0->getOperand(0)->getOperand(0);
  SDNode *V = Vec.getNode();
  unsigned nextIndex = 0;

  // For each operand of the ADD (both are BUILD_VECTORs), check that each of
  // its operands is an EXTRACT_VECTOR_ELT with the same vector and the
  // appropriate index.
  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

      SDValue ExtVec0 = N0->getOperand(i);
      SDValue ExtVec1 = N1->getOperand(i);

      // First operand is the vector; verify it's the same.
      if (V != ExtVec0->getOperand(0).getNode() ||
          V != ExtVec1->getOperand(0).getNode())
        return SDValue();

      // Second is the constant; verify it's correct.
      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

      // For the constant, we want to see all the even or all the odd.
      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
          || C1->getZExtValue() != nextIndex+1)
        return SDValue();

      // Increment index.
      nextIndex += 2;
    } else
      return SDValue();
  }
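  // At this point every lane pair has been checked: N0 gathers the even lanes
  // of Vec and N1 the odd lanes. As a purely illustrative example
  // (hypothetical values, not taken from any particular test):
  //   Vec = <4 x i16> %v
  //   N0  = build_vector (extractelt %v, 0), (extractelt %v, 2)
  //   N1  = build_vector (extractelt %v, 1), (extractelt %v, 3)
  // so adding N0 and N1 element-wise is a pairwise add of adjacent lanes of
  // %v, which is what vpaddl computes.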
  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
  // we're using the entire input vector, otherwise there's a size/legality
  // mismatch somewhere.
  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
    return SDValue();

  // Create VPADDL node.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDLoc dl(N);

  // Build operand list.
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
                                TLI.getPointerTy(DAG.getDataLayout())));

  // Input is the vector.
  Ops.push_back(Vec);

  // Get widened type and narrowed type.
  MVT widenType;
  unsigned numElem = VT.getVectorNumElements();

  EVT inputLaneType = Vec.getValueType().getVectorElementType();
  switch (inputLaneType.getSimpleVT().SimpleTy) {
    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
    default:
      llvm_unreachable("Invalid vector element type for padd optimization.");
  }

  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
  return DAG.getNode(ExtOp, dl, VT, tmp);
}

static SDValue findMUL_LOHI(SDValue V) {
  if (V->getOpcode() == ISD::UMUL_LOHI ||
      V->getOpcode() == ISD::SMUL_LOHI)
    return V;
  return SDValue();
}

static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasBaseDSP())
    return SDValue();

  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
  // accumulates the product into a 64-bit value.
The 16-bit values will12834// be sign extended somehow or SRA'd into 32-bit values12835// (addc (adde (mul 16bit, 16bit), lo), hi)12836SDValue Mul = AddcNode->getOperand(0);12837SDValue Lo = AddcNode->getOperand(1);12838if (Mul.getOpcode() != ISD::MUL) {12839Lo = AddcNode->getOperand(0);12840Mul = AddcNode->getOperand(1);12841if (Mul.getOpcode() != ISD::MUL)12842return SDValue();12843}1284412845SDValue SRA = AddeNode->getOperand(0);12846SDValue Hi = AddeNode->getOperand(1);12847if (SRA.getOpcode() != ISD::SRA) {12848SRA = AddeNode->getOperand(1);12849Hi = AddeNode->getOperand(0);12850if (SRA.getOpcode() != ISD::SRA)12851return SDValue();12852}12853if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {12854if (Const->getZExtValue() != 31)12855return SDValue();12856} else12857return SDValue();1285812859if (SRA.getOperand(0) != Mul)12860return SDValue();1286112862SelectionDAG &DAG = DCI.DAG;12863SDLoc dl(AddcNode);12864unsigned Opcode = 0;12865SDValue Op0;12866SDValue Op1;1286712868if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {12869Opcode = ARMISD::SMLALBB;12870Op0 = Mul.getOperand(0);12871Op1 = Mul.getOperand(1);12872} else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {12873Opcode = ARMISD::SMLALBT;12874Op0 = Mul.getOperand(0);12875Op1 = Mul.getOperand(1).getOperand(0);12876} else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {12877Opcode = ARMISD::SMLALTB;12878Op0 = Mul.getOperand(0).getOperand(0);12879Op1 = Mul.getOperand(1);12880} else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {12881Opcode = ARMISD::SMLALTT;12882Op0 = Mul->getOperand(0).getOperand(0);12883Op1 = Mul->getOperand(1).getOperand(0);12884}1288512886if (!Op0 || !Op1)12887return SDValue();1288812889SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),12890Op0, Op1, Lo, Hi);12891// Replace the ADDs' nodes uses by the MLA node's values.12892SDValue HiMLALResult(SMLAL.getNode(), 1);12893SDValue LoMLALResult(SMLAL.getNode(), 0);1289412895DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);12896DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);1289712898// Return original node to notify the driver to stop replacing.12899SDValue resNode(AddcNode, 0);12900return resNode;12901}1290212903static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,12904TargetLowering::DAGCombinerInfo &DCI,12905const ARMSubtarget *Subtarget) {12906// Look for multiply add opportunities.12907// The pattern is a ISD::UMUL_LOHI followed by two add nodes, where12908// each add nodes consumes a value from ISD::UMUL_LOHI and there is12909// a glue link from the first add to the second add.12910// If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by12911// a S/UMLAL instruction.12912// UMUL_LOHI12913// / :lo \ :hi12914// V \ [no multiline comment]12915// loAdd -> ADDC |12916// \ :carry /12917// V V12918// ADDE <- hiAdd12919//12920// In the special case where only the higher part of a signed result is used12921// and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts12922// a constant with the exact value of 0x80000000, we recognize we are dealing12923// with a "rounded multiply and add" (or subtract) and transform it into12924// either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.1292512926assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||12927AddeSubeNode->getOpcode() == ARMISD::SUBE) &&12928"Expect an ADDE or SUBE");1292912930assert(AddeSubeNode->getNumOperands() == 3 
&&12931AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&12932"ADDE node has the wrong inputs");1293312934// Check that we are chained to the right ADDC or SUBC node.12935SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();12936if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&12937AddcSubcNode->getOpcode() != ARMISD::ADDC) ||12938(AddeSubeNode->getOpcode() == ARMISD::SUBE &&12939AddcSubcNode->getOpcode() != ARMISD::SUBC))12940return SDValue();1294112942SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);12943SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);1294412945// Check if the two operands are from the same mul_lohi node.12946if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())12947return SDValue();1294812949assert(AddcSubcNode->getNumValues() == 2 &&12950AddcSubcNode->getValueType(0) == MVT::i32 &&12951"Expect ADDC with two result values. First: i32");1295212953// Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it12954// maybe a SMLAL which multiplies two 16-bit values.12955if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&12956AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&12957AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&12958AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&12959AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)12960return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);1296112962// Check for the triangle shape.12963SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);12964SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);1296512966// Make sure that the ADDE/SUBE operands are not coming from the same node.12967if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())12968return SDValue();1296912970// Find the MUL_LOHI node walking up ADDE/SUBE's operands.12971bool IsLeftOperandMUL = false;12972SDValue MULOp = findMUL_LOHI(AddeSubeOp0);12973if (MULOp == SDValue())12974MULOp = findMUL_LOHI(AddeSubeOp1);12975else12976IsLeftOperandMUL = true;12977if (MULOp == SDValue())12978return SDValue();1297912980// Figure out the right opcode.12981unsigned Opc = MULOp->getOpcode();12982unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;1298312984// Figure out the high and low input values to the MLAL node.12985SDValue *HiAddSub = nullptr;12986SDValue *LoMul = nullptr;12987SDValue *LowAddSub = nullptr;1298812989// Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.12990if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))12991return SDValue();1299212993if (IsLeftOperandMUL)12994HiAddSub = &AddeSubeOp1;12995else12996HiAddSub = &AddeSubeOp0;1299712998// Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node12999// whose low result is fed to the ADDC/SUBC we are checking.1300013001if (AddcSubcOp0 == MULOp.getValue(0)) {13002LoMul = &AddcSubcOp0;13003LowAddSub = &AddcSubcOp1;13004}13005if (AddcSubcOp1 == MULOp.getValue(0)) {13006LoMul = &AddcSubcOp1;13007LowAddSub = &AddcSubcOp0;13008}1300913010if (!LoMul)13011return SDValue();1301213013// If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC13014// the replacement below will create a cycle.13015if (AddcSubcNode == HiAddSub->getNode() ||13016AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))13017return SDValue();1301813019// Create the merged node.13020SelectionDAG &DAG = DCI.DAG;1302113022// Start building operand list.13023SmallVector<SDValue, 8> Ops;13024Ops.push_back(LoMul->getOperand(0));13025Ops.push_back(LoMul->getOperand(1));1302613027// Check whether we can use SMMLAR, SMMLSR or SMMULR instead. 
  // For this to be the case, we must be doing signed multiplication and only
  // use the higher part of the result of the MLAL; furthermore, the LowAddSub
  // must be a constant addition or subtraction with the value of 0x80000000.
  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
          0x80000000) {
    Ops.push_back(*HiAddSub);
    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
      FinalOpc = ARMISD::SMMLSR;
    } else {
      FinalOpc = ARMISD::SMMLAR;
    }
    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

    return SDValue(AddeSubeNode, 0);
  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
    // SMMLS is generated during instruction selection and the rest of this
    // function cannot handle the case where AddcSubcNode is a SUBC.
    return SDValue();

  // Finish building the operand list for {U/S}MLAL
  Ops.push_back(*LowAddSub);
  Ops.push_back(*HiAddSub);

  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

  // Replace the ADDs' nodes uses by the MLA node's values.
  SDValue HiMLALResult(MLALNode.getNode(), 1);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

  SDValue LoMLALResult(MLALNode.getNode(), 0);
  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

  // Return original node to notify the driver to stop replacing.
  return SDValue(AddeSubeNode, 0);
}

static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  // UMAAL is similar to UMLAL except that it adds two unsigned values.
  // While trying to combine for the other MLAL nodes, first search for the
  // chance to use UMAAL. Check if Addc uses a node which has already
  // been combined into a UMLAL.
The other pattern is UMLAL using Addc/Adde13076// as the addend, and it's handled in PerformUMLALCombine.1307713078if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())13079return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);1308013081// Check that we have a glued ADDC node.13082SDNode* AddcNode = AddeNode->getOperand(2).getNode();13083if (AddcNode->getOpcode() != ARMISD::ADDC)13084return SDValue();1308513086// Find the converted UMAAL or quit if it doesn't exist.13087SDNode *UmlalNode = nullptr;13088SDValue AddHi;13089if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {13090UmlalNode = AddcNode->getOperand(0).getNode();13091AddHi = AddcNode->getOperand(1);13092} else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {13093UmlalNode = AddcNode->getOperand(1).getNode();13094AddHi = AddcNode->getOperand(0);13095} else {13096return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);13097}1309813099// The ADDC should be glued to an ADDE node, which uses the same UMLAL as13100// the ADDC as well as Zero.13101if (!isNullConstant(UmlalNode->getOperand(3)))13102return SDValue();1310313104if ((isNullConstant(AddeNode->getOperand(0)) &&13105AddeNode->getOperand(1).getNode() == UmlalNode) ||13106(AddeNode->getOperand(0).getNode() == UmlalNode &&13107isNullConstant(AddeNode->getOperand(1)))) {13108SelectionDAG &DAG = DCI.DAG;13109SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),13110UmlalNode->getOperand(2), AddHi };13111SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),13112DAG.getVTList(MVT::i32, MVT::i32), Ops);1311313114// Replace the ADDs' nodes uses by the UMAAL node's values.13115DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));13116DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));1311713118// Return original node to notify the driver to stop replacing.13119return SDValue(AddeNode, 0);13120}13121return SDValue();13122}1312313124static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,13125const ARMSubtarget *Subtarget) {13126if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())13127return SDValue();1312813129// Check that we have a pair of ADDC and ADDE as operands.13130// Both addends of the ADDE must be zero.13131SDNode* AddcNode = N->getOperand(2).getNode();13132SDNode* AddeNode = N->getOperand(3).getNode();13133if ((AddcNode->getOpcode() == ARMISD::ADDC) &&13134(AddeNode->getOpcode() == ARMISD::ADDE) &&13135isNullConstant(AddeNode->getOperand(0)) &&13136isNullConstant(AddeNode->getOperand(1)) &&13137(AddeNode->getOperand(2).getNode() == AddcNode))13138return DAG.getNode(ARMISD::UMAAL, SDLoc(N),13139DAG.getVTList(MVT::i32, MVT::i32),13140{N->getOperand(0), N->getOperand(1),13141AddcNode->getOperand(0), AddcNode->getOperand(1)});13142else13143return SDValue();13144}1314513146static SDValue PerformAddcSubcCombine(SDNode *N,13147TargetLowering::DAGCombinerInfo &DCI,13148const ARMSubtarget *Subtarget) {13149SelectionDAG &DAG(DCI.DAG);1315013151if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {13152// (SUBC (ADDE 0, 0, C), 1) -> C13153SDValue LHS = N->getOperand(0);13154SDValue RHS = N->getOperand(1);13155if (LHS->getOpcode() == ARMISD::ADDE &&13156isNullConstant(LHS->getOperand(0)) &&13157isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {13158return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));13159}13160}1316113162if (Subtarget->isThumb1Only()) {13163SDValue RHS = N->getOperand(1);13164if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {13165int32_t imm 
= C->getSExtValue();13166if (imm < 0 && imm > std::numeric_limits<int>::min()) {13167SDLoc DL(N);13168RHS = DAG.getConstant(-imm, DL, MVT::i32);13169unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC13170: ARMISD::ADDC;13171return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);13172}13173}13174}1317513176return SDValue();13177}1317813179static SDValue PerformAddeSubeCombine(SDNode *N,13180TargetLowering::DAGCombinerInfo &DCI,13181const ARMSubtarget *Subtarget) {13182if (Subtarget->isThumb1Only()) {13183SelectionDAG &DAG = DCI.DAG;13184SDValue RHS = N->getOperand(1);13185if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {13186int64_t imm = C->getSExtValue();13187if (imm < 0) {13188SDLoc DL(N);1318913190// The with-carry-in form matches bitwise not instead of the negation.13191// Effectively, the inverse interpretation of the carry flag already13192// accounts for part of the negation.13193RHS = DAG.getConstant(~imm, DL, MVT::i32);1319413195unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE13196: ARMISD::ADDE;13197return DAG.getNode(Opcode, DL, N->getVTList(),13198N->getOperand(0), RHS, N->getOperand(2));13199}13200}13201} else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {13202return AddCombineTo64bitMLAL(N, DCI, Subtarget);13203}13204return SDValue();13205}1320613207static SDValue PerformSELECTCombine(SDNode *N,13208TargetLowering::DAGCombinerInfo &DCI,13209const ARMSubtarget *Subtarget) {13210if (!Subtarget->hasMVEIntegerOps())13211return SDValue();1321213213SDLoc dl(N);13214SDValue SetCC;13215SDValue LHS;13216SDValue RHS;13217ISD::CondCode CC;13218SDValue TrueVal;13219SDValue FalseVal;1322013221if (N->getOpcode() == ISD::SELECT &&13222N->getOperand(0)->getOpcode() == ISD::SETCC) {13223SetCC = N->getOperand(0);13224LHS = SetCC->getOperand(0);13225RHS = SetCC->getOperand(1);13226CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();13227TrueVal = N->getOperand(1);13228FalseVal = N->getOperand(2);13229} else if (N->getOpcode() == ISD::SELECT_CC) {13230LHS = N->getOperand(0);13231RHS = N->getOperand(1);13232CC = cast<CondCodeSDNode>(N->getOperand(4))->get();13233TrueVal = N->getOperand(2);13234FalseVal = N->getOperand(3);13235} else {13236return SDValue();13237}1323813239unsigned int Opcode = 0;13240if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||13241FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&13242(CC == ISD::SETULT || CC == ISD::SETUGT)) {13243Opcode = ARMISD::VMINVu;13244if (CC == ISD::SETUGT)13245std::swap(TrueVal, FalseVal);13246} else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||13247FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&13248(CC == ISD::SETLT || CC == ISD::SETGT)) {13249Opcode = ARMISD::VMINVs;13250if (CC == ISD::SETGT)13251std::swap(TrueVal, FalseVal);13252} else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||13253FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&13254(CC == ISD::SETUGT || CC == ISD::SETULT)) {13255Opcode = ARMISD::VMAXVu;13256if (CC == ISD::SETULT)13257std::swap(TrueVal, FalseVal);13258} else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||13259FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&13260(CC == ISD::SETGT || CC == ISD::SETLT)) {13261Opcode = ARMISD::VMAXVs;13262if (CC == ISD::SETLT)13263std::swap(TrueVal, FalseVal);13264} else13265return SDValue();1326613267// Normalise to the right hand side being the vector reduction13268switch (TrueVal->getOpcode()) {13269case ISD::VECREDUCE_UMIN:13270case ISD::VECREDUCE_SMIN:13271case ISD::VECREDUCE_UMAX:13272case 
ISD::VECREDUCE_SMAX:13273std::swap(LHS, RHS);13274std::swap(TrueVal, FalseVal);13275break;13276}1327713278EVT VectorType = FalseVal->getOperand(0).getValueType();1327913280if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&13281VectorType != MVT::v4i32)13282return SDValue();1328313284EVT VectorScalarType = VectorType.getVectorElementType();1328513286// The values being selected must also be the ones being compared13287if (TrueVal != LHS || FalseVal != RHS)13288return SDValue();1328913290EVT LeftType = LHS->getValueType(0);13291EVT RightType = RHS->getValueType(0);1329213293// The types must match the reduced type too13294if (LeftType != VectorScalarType || RightType != VectorScalarType)13295return SDValue();1329613297// Legalise the scalar to an i3213298if (VectorScalarType != MVT::i32)13299LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);1330013301// Generate the reduction as an i32 for legalisation purposes13302auto Reduction =13303DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));1330413305// The result isn't actually an i32 so truncate it back to its original type13306if (VectorScalarType != MVT::i32)13307Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);1330813309return Reduction;13310}1331113312// A special combine for the vqdmulh family of instructions. This is one of the13313// potential set of patterns that could patch this instruction. The base pattern13314// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).13315// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),13316// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as13317// the max is unnecessary.13318static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {13319EVT VT = N->getValueType(0);13320SDValue Shft;13321ConstantSDNode *Clamp;1332213323if (!VT.isVector() || VT.getScalarSizeInBits() > 64)13324return SDValue();1332513326if (N->getOpcode() == ISD::SMIN) {13327Shft = N->getOperand(0);13328Clamp = isConstOrConstSplat(N->getOperand(1));13329} else if (N->getOpcode() == ISD::VSELECT) {13330// Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.13331SDValue Cmp = N->getOperand(0);13332if (Cmp.getOpcode() != ISD::SETCC ||13333cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||13334Cmp.getOperand(0) != N->getOperand(1) ||13335Cmp.getOperand(1) != N->getOperand(2))13336return SDValue();13337Shft = N->getOperand(1);13338Clamp = isConstOrConstSplat(N->getOperand(2));13339} else13340return SDValue();1334113342if (!Clamp)13343return SDValue();1334413345MVT ScalarType;13346int ShftAmt = 0;13347switch (Clamp->getSExtValue()) {13348case (1 << 7) - 1:13349ScalarType = MVT::i8;13350ShftAmt = 7;13351break;13352case (1 << 15) - 1:13353ScalarType = MVT::i16;13354ShftAmt = 15;13355break;13356case (1ULL << 31) - 1:13357ScalarType = MVT::i32;13358ShftAmt = 31;13359break;13360default:13361return SDValue();13362}1336313364if (Shft.getOpcode() != ISD::SRA)13365return SDValue();13366ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));13367if (!N1 || N1->getSExtValue() != ShftAmt)13368return SDValue();1336913370SDValue Mul = Shft.getOperand(0);13371if (Mul.getOpcode() != ISD::MUL)13372return SDValue();1337313374SDValue Ext0 = Mul.getOperand(0);13375SDValue Ext1 = Mul.getOperand(1);13376if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||13377Ext1.getOpcode() != ISD::SIGN_EXTEND)13378return SDValue();13379EVT VecVT = Ext0.getOperand(0).getValueType();13380if 
(!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)13381return SDValue();13382if (Ext1.getOperand(0).getValueType() != VecVT ||13383VecVT.getScalarType() != ScalarType ||13384VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)13385return SDValue();1338613387SDLoc DL(Mul);13388unsigned LegalLanes = 128 / (ShftAmt + 1);13389EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);13390// For types smaller than legal vectors extend to be legal and only use needed13391// lanes.13392if (VecVT.getSizeInBits() < 128) {13393EVT ExtVecVT =13394MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),13395VecVT.getVectorNumElements());13396SDValue Inp0 =13397DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));13398SDValue Inp1 =13399DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));13400Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);13401Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);13402SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);13403SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);13404Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);13405return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);13406}1340713408// For larger types, split into legal sized chunks.13409assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");13410unsigned NumParts = VecVT.getSizeInBits() / 128;13411SmallVector<SDValue> Parts;13412for (unsigned I = 0; I < NumParts; ++I) {13413SDValue Inp0 =13414DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),13415DAG.getVectorIdxConstant(I * LegalLanes, DL));13416SDValue Inp1 =13417DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),13418DAG.getVectorIdxConstant(I * LegalLanes, DL));13419SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);13420Parts.push_back(VQDMULH);13421}13422return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,13423DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));13424}1342513426static SDValue PerformVSELECTCombine(SDNode *N,13427TargetLowering::DAGCombinerInfo &DCI,13428const ARMSubtarget *Subtarget) {13429if (!Subtarget->hasMVEIntegerOps())13430return SDValue();1343113432if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))13433return V;1343413435// Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).13436//13437// We need to re-implement this optimization here as the implementation in the13438// Target-Independent DAGCombiner does not handle the kind of constant we make13439// (it calls isConstOrConstSplat with AllowTruncation set to false - and for13440// good reason, allowing truncation there would break other targets).13441//13442// Currently, this is only done for MVE, as it's the only target that benefits13443// from this transformation (e.g. 
VPNOT+VPSEL becomes a single VPSEL).13444if (N->getOperand(0).getOpcode() != ISD::XOR)13445return SDValue();13446SDValue XOR = N->getOperand(0);1344713448// Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.13449// It is important to check with truncation allowed as the BUILD_VECTORs we13450// generate in those situations will truncate their operands.13451ConstantSDNode *Const =13452isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,13453/*AllowTruncation*/ true);13454if (!Const || !Const->isOne())13455return SDValue();1345613457// Rewrite into vselect(cond, rhs, lhs).13458SDValue Cond = XOR->getOperand(0);13459SDValue LHS = N->getOperand(1);13460SDValue RHS = N->getOperand(2);13461EVT Type = N->getValueType(0);13462return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);13463}1346413465// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n13466static SDValue PerformVSetCCToVCTPCombine(SDNode *N,13467TargetLowering::DAGCombinerInfo &DCI,13468const ARMSubtarget *Subtarget) {13469SDValue Op0 = N->getOperand(0);13470SDValue Op1 = N->getOperand(1);13471ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();13472EVT VT = N->getValueType(0);1347313474if (!Subtarget->hasMVEIntegerOps() ||13475!DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))13476return SDValue();1347713478if (CC == ISD::SETUGE) {13479std::swap(Op0, Op1);13480CC = ISD::SETULT;13481}1348213483if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||13484Op0.getOpcode() != ISD::BUILD_VECTOR)13485return SDValue();1348613487// Check first operand is BuildVector of 0,1,2,...13488for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {13489if (!Op0.getOperand(I).isUndef() &&13490!(isa<ConstantSDNode>(Op0.getOperand(I)) &&13491Op0.getConstantOperandVal(I) == I))13492return SDValue();13493}1349413495// The second is a Splat of Op1S13496SDValue Op1S = DCI.DAG.getSplatValue(Op1);13497if (!Op1S)13498return SDValue();1349913500unsigned Opc;13501switch (VT.getVectorNumElements()) {13502case 2:13503Opc = Intrinsic::arm_mve_vctp64;13504break;13505case 4:13506Opc = Intrinsic::arm_mve_vctp32;13507break;13508case 8:13509Opc = Intrinsic::arm_mve_vctp16;13510break;13511case 16:13512Opc = Intrinsic::arm_mve_vctp8;13513break;13514default:13515return SDValue();13516}1351713518SDLoc DL(N);13519return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,13520DCI.DAG.getConstant(Opc, DL, MVT::i32),13521DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));13522}1352313524/// PerformADDECombine - Target-specific dag combine transform from13525/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or13526/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL13527static SDValue PerformADDECombine(SDNode *N,13528TargetLowering::DAGCombinerInfo &DCI,13529const ARMSubtarget *Subtarget) {13530// Only ARM and Thumb2 support UMLAL/SMLAL.13531if (Subtarget->isThumb1Only())13532return PerformAddeSubeCombine(N, DCI, Subtarget);1353313534// Only perform the checks after legalize when the pattern is available.13535if (DCI.isBeforeLegalize()) return SDValue();1353613537return AddCombineTo64bitUMAAL(N, DCI, Subtarget);13538}1353913540/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with13541/// operands N0 and N1. 
This is a helper for PerformADDCombine that is13542/// called with the default operands, and if that fails, with commuted13543/// operands.13544static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,13545TargetLowering::DAGCombinerInfo &DCI,13546const ARMSubtarget *Subtarget){13547// Attempt to create vpadd for this add.13548if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))13549return Result;1355013551// Attempt to create vpaddl for this add.13552if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))13553return Result;13554if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,13555Subtarget))13556return Result;1355713558// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))13559if (N0.getNode()->hasOneUse())13560if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))13561return Result;13562return SDValue();13563}1356413565static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {13566EVT VT = N->getValueType(0);13567SDValue N0 = N->getOperand(0);13568SDValue N1 = N->getOperand(1);13569SDLoc dl(N);1357013571auto IsVecReduce = [](SDValue Op) {13572switch (Op.getOpcode()) {13573case ISD::VECREDUCE_ADD:13574case ARMISD::VADDVs:13575case ARMISD::VADDVu:13576case ARMISD::VMLAVs:13577case ARMISD::VMLAVu:13578return true;13579}13580return false;13581};1358213583auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {13584// Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->13585// add(add(X, vecreduce(Y)), vecreduce(Z))13586// to make better use of vaddva style instructions.13587if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&13588IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&13589!isa<ConstantSDNode>(N0) && N1->hasOneUse()) {13590SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));13591return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));13592}13593// And turn add(add(A, reduce(B)), add(C, reduce(D))) ->13594// add(add(add(A, C), reduce(B)), reduce(D))13595if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&13596N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {13597unsigned N0RedOp = 0;13598if (!IsVecReduce(N0.getOperand(N0RedOp))) {13599N0RedOp = 1;13600if (!IsVecReduce(N0.getOperand(N0RedOp)))13601return SDValue();13602}1360313604unsigned N1RedOp = 0;13605if (!IsVecReduce(N1.getOperand(N1RedOp)))13606N1RedOp = 1;13607if (!IsVecReduce(N1.getOperand(N1RedOp)))13608return SDValue();1360913610SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),13611N1.getOperand(1 - N1RedOp));13612SDValue Add1 =13613DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));13614return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));13615}13616return SDValue();13617};13618if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))13619return R;13620if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))13621return R;1362213623// Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))13624// Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))13625// by ascending load offsets. This can help cores prefetch if the order of13626// loads is more predictable.13627auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {13628// Check if two reductions are known to load data where one is before/after13629// another. 
Return negative if N0 loads data before N1, positive if N1 is13630// before N0 and 0 otherwise if nothing is known.13631auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {13632// Look through to the first operand of a MUL, for the VMLA case.13633// Currently only looks at the first operand, in the hope they are equal.13634if (N0.getOpcode() == ISD::MUL)13635N0 = N0.getOperand(0);13636if (N1.getOpcode() == ISD::MUL)13637N1 = N1.getOperand(0);1363813639// Return true if the two operands are loads to the same object and the13640// offset of the first is known to be less than the offset of the second.13641LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);13642LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);13643if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||13644!Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||13645Load1->isIndexed())13646return 0;1364713648auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);13649auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);1365013651if (!BaseLocDecomp0.getBase() ||13652BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||13653!BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())13654return 0;13655if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())13656return -1;13657if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())13658return 1;13659return 0;13660};1366113662SDValue X;13663if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {13664if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {13665int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),13666N0.getOperand(1).getOperand(0));13667if (IsBefore < 0) {13668X = N0.getOperand(0);13669N0 = N0.getOperand(1);13670} else if (IsBefore > 0) {13671X = N0.getOperand(1);13672N0 = N0.getOperand(0);13673} else13674return SDValue();13675} else if (IsVecReduce(N0.getOperand(0))) {13676X = N0.getOperand(1);13677N0 = N0.getOperand(0);13678} else if (IsVecReduce(N0.getOperand(1))) {13679X = N0.getOperand(0);13680N0 = N0.getOperand(1);13681} else13682return SDValue();13683} else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&13684IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {13685// Note this is backward to how you would expect. We create13686// add(reduce(load + 16), reduce(load + 0)) so that the13687// add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving13688// the X as VADDV(load + 0)13689return DAG.getNode(ISD::ADD, dl, VT, N1, N0);13690} else13691return SDValue();1369213693if (!IsVecReduce(N0) || !IsVecReduce(N1))13694return SDValue();1369513696if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)13697return SDValue();1369813699// Switch from add(add(X, N0), N1) to add(add(X, N1), N0)13700SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);13701return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);13702};13703if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))13704return R;13705if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))13706return R;13707return SDValue();13708}1370913710static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,13711const ARMSubtarget *Subtarget) {13712if (!Subtarget->hasMVEIntegerOps())13713return SDValue();1371413715if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))13716return R;1371713718EVT VT = N->getValueType(0);13719SDValue N0 = N->getOperand(0);13720SDValue N1 = N->getOperand(1);13721SDLoc dl(N);1372213723if (VT != MVT::i64)13724return SDValue();1372513726// We are looking for a i64 add of a VADDLVx. 
Due to these being i64's, this13727// will look like:13728// t1: i32,i32 = ARMISD::VADDLVs x13729// t2: i64 = build_pair t1, t1:113730// t3: i64 = add t2, y13731// Otherwise we try to push the add up above VADDLVAx, to potentially allow13732// the add to be simplified separately.13733// We also need to check for sext / zext and commutitive adds.13734auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,13735SDValue NB) {13736if (NB->getOpcode() != ISD::BUILD_PAIR)13737return SDValue();13738SDValue VecRed = NB->getOperand(0);13739if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||13740VecRed.getResNo() != 0 ||13741NB->getOperand(1) != SDValue(VecRed.getNode(), 1))13742return SDValue();1374313744if (VecRed->getOpcode() == OpcodeA) {13745// add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)13746SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,13747VecRed.getOperand(0), VecRed.getOperand(1));13748NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);13749}1375013751SmallVector<SDValue, 4> Ops(2);13752std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);1375313754unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;13755for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)13756Ops.push_back(VecRed->getOperand(I));13757SDValue Red =13758DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);13759return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,13760SDValue(Red.getNode(), 1));13761};1376213763if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))13764return M;13765if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))13766return M;13767if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))13768return M;13769if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))13770return M;13771if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))13772return M;13773if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))13774return M;13775if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))13776return M;13777if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))13778return M;13779if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))13780return M;13781if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))13782return M;13783if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))13784return M;13785if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))13786return M;13787if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))13788return M;13789if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))13790return M;13791if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))13792return M;13793if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))13794return M;13795return SDValue();13796}1379713798bool13799ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,13800CombineLevel Level) const {13801assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||13802N->getOpcode() == ISD::SRL) &&13803"Expected shift op");1380413805if (Level == BeforeLegalizeTypes)13806return true;1380713808if (N->getOpcode() != ISD::SHL)13809return true;1381013811if (Subtarget->isThumb1Only()) {13812// Avoid making expensive immediates by commuting shifts. 
(This logic13813// only applies to Thumb1 because ARM and Thumb2 immediates can be shifted13814// for free.)13815if (N->getOpcode() != ISD::SHL)13816return true;13817SDValue N1 = N->getOperand(0);13818if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&13819N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)13820return true;13821if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {13822if (Const->getAPIntValue().ult(256))13823return false;13824if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&13825Const->getAPIntValue().sgt(-256))13826return false;13827}13828return true;13829}1383013831// Turn off commute-with-shift transform after legalization, so it doesn't13832// conflict with PerformSHLSimplify. (We could try to detect when13833// PerformSHLSimplify would trigger more precisely, but it isn't13834// really necessary.)13835return false;13836}1383713838bool ARMTargetLowering::isDesirableToCommuteXorWithShift(13839const SDNode *N) const {13840assert(N->getOpcode() == ISD::XOR &&13841(N->getOperand(0).getOpcode() == ISD::SHL ||13842N->getOperand(0).getOpcode() == ISD::SRL) &&13843"Expected XOR(SHIFT) pattern");1384413845// Only commute if the entire NOT mask is a hidden shifted mask.13846auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));13847auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));13848if (XorC && ShiftC) {13849unsigned MaskIdx, MaskLen;13850if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {13851unsigned ShiftAmt = ShiftC->getZExtValue();13852unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();13853if (N->getOperand(0).getOpcode() == ISD::SHL)13854return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);13855return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);13856}13857}1385813859return false;13860}1386113862bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(13863const SDNode *N, CombineLevel Level) const {13864assert(((N->getOpcode() == ISD::SHL &&13865N->getOperand(0).getOpcode() == ISD::SRL) ||13866(N->getOpcode() == ISD::SRL &&13867N->getOperand(0).getOpcode() == ISD::SHL)) &&13868"Expected shift-shift mask");1386913870if (!Subtarget->isThumb1Only())13871return true;1387213873if (Level == BeforeLegalizeTypes)13874return true;1387513876return false;13877}1387813879bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,13880EVT VT) const {13881return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT);13882}1388313884bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {13885if (!Subtarget->hasNEON()) {13886if (Subtarget->isThumb1Only())13887return VT.getScalarSizeInBits() <= 32;13888return true;13889}13890return VT.isScalarInteger();13891}1389213893bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,13894EVT VT) const {13895if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())13896return false;1389713898switch (FPVT.getSimpleVT().SimpleTy) {13899case MVT::f16:13900return Subtarget->hasVFP2Base();13901case MVT::f32:13902return Subtarget->hasVFP2Base();13903case MVT::f64:13904return Subtarget->hasFP64();13905case MVT::v4f32:13906case MVT::v8f16:13907return Subtarget->hasMVEFloatOps();13908default:13909return false;13910}13911}1391213913static SDValue PerformSHLSimplify(SDNode *N,13914TargetLowering::DAGCombinerInfo &DCI,13915const ARMSubtarget *ST) {13916// Allow the generic combiner to identify potential bswaps.13917if (DCI.isBeforeLegalize())13918return SDValue();1391913920// DAG combiner will fold:13921// (shl (add 
x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl.

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto *U : N->uses()) {
    switch (U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();
  unsigned C2Width = C2Int.getBitWidth();
  if (C2Int.uge(C2Width))
    return SDValue();
  uint64_t C2Value = C2Int.getZExtValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget
*Subtarget) {14024SDValue N0 = N->getOperand(0);14025SDValue N1 = N->getOperand(1);1402614027// Only works one way, because it needs an immediate operand.14028if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))14029return Result;1403014031if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))14032return Result;1403314034// First try with the default operand order.14035if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))14036return Result;1403714038// If that didn't work, try again with the operands commuted.14039return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);14040}1404114042// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)14043// providing -X is as cheap as X (currently, just a constant).14044static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {14045if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))14046return SDValue();14047SDValue CSINC = N->getOperand(1);14048if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())14049return SDValue();1405014051ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));14052if (!X)14053return SDValue();1405414055return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,14056DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),14057CSINC.getOperand(0)),14058CSINC.getOperand(1), CSINC.getOperand(2),14059CSINC.getOperand(3));14060}1406114062/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.14063///14064static SDValue PerformSUBCombine(SDNode *N,14065TargetLowering::DAGCombinerInfo &DCI,14066const ARMSubtarget *Subtarget) {14067SDValue N0 = N->getOperand(0);14068SDValue N1 = N->getOperand(1);1406914070// fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))14071if (N1.getNode()->hasOneUse())14072if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))14073return Result;1407414075if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))14076return R;1407714078if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())14079return SDValue();1408014081// Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))14082// so that we can readily pattern match more mve instructions which can use14083// a scalar operand.14084SDValue VDup = N->getOperand(1);14085if (VDup->getOpcode() != ARMISD::VDUP)14086return SDValue();1408714088SDValue VMov = N->getOperand(0);14089if (VMov->getOpcode() == ISD::BITCAST)14090VMov = VMov->getOperand(0);1409114092if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))14093return SDValue();1409414095SDLoc dl(N);14096SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,14097DCI.DAG.getConstant(0, dl, MVT::i32),14098VDup->getOperand(0));14099return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);14100}1410114102/// PerformVMULCombine14103/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the14104/// special multiplier accumulator forwarding.14105/// vmul d3, d0, d214106/// vmla d3, d1, d214107/// is faster than14108/// vadd d3, d0, d114109/// vmul d3, d3, d214110// However, for (A + B) * (A + B),14111// vadd d2, d0, d114112// vmul d3, d0, d214113// vmla d3, d1, d214114// is slower than14115// vadd d2, d0, d114116// vmul d3, d2, d214117static SDValue PerformVMULCombine(SDNode *N,14118TargetLowering::DAGCombinerInfo &DCI,14119const ARMSubtarget *Subtarget) {14120if (!Subtarget->hasVMLxForwarding())14121return SDValue();1412214123SelectionDAG &DAG = DCI.DAG;14124SDValue N0 = N->getOperand(0);14125SDValue N1 = N->getOperand(1);14126unsigned 
Opcode = N0.getOpcode();
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}

static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto IsSignExt = [&](SDValue Op) {
    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
      return SDValue();
    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
    if (VT.getScalarSizeInBits() == 32)
      return Op->getOperand(0);
    return SDValue();
  };
  auto IsZeroExt = [&](SDValue Op) {
    // Zero extends are a little more awkward. At the point we are matching
    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
    // That might be before or after a bitcast depending on how the and is
    // placed. Because this has to look through bitcasts, it is currently only
    // supported on LE.
    if (!Subtarget->isLittle())
      return SDValue();

    SDValue And = Op;
    if (And->getOpcode() == ISD::BITCAST)
      And = And->getOperand(0);
    if (And->getOpcode() != ISD::AND)
      return SDValue();
    SDValue Mask = And->getOperand(1);
    if (Mask->getOpcode() == ISD::BITCAST)
      Mask = Mask->getOperand(0);

    if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
        Mask.getValueType() != MVT::v4i32)
      return SDValue();
    if (isAllOnesConstant(Mask->getOperand(0)) &&
        isNullConstant(Mask->getOperand(1)) &&
        isAllOnesConstant(Mask->getOperand(2)) &&
        isNullConstant(Mask->getOperand(3)))
      return And->getOperand(0);
    return SDValue();
  };

  SDLoc dl(N);
  if (SDValue Op0 = IsSignExt(N0)) {
    if (SDValue Op1 = IsSignExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
    }
  }
  if (SDValue Op0 = IsZeroExt(N0)) {
    if (SDValue Op1 = IsZeroExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
    }
  }

  return SDValue();
}

static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
    return PerformMVEVMULLCombine(N, DAG, Subtarget);

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C =
dyn_cast<ConstantSDNode>(N->getOperand(1));14234if (!C)14235return SDValue();1423614237int64_t MulAmt = C->getSExtValue();14238unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);1423914240ShiftAmt = ShiftAmt & (32 - 1);14241SDValue V = N->getOperand(0);14242SDLoc DL(N);1424314244SDValue Res;14245MulAmt >>= ShiftAmt;1424614247if (MulAmt >= 0) {14248if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {14249// (mul x, 2^N + 1) => (add (shl x, N), x)14250Res = DAG.getNode(ISD::ADD, DL, VT,14251V,14252DAG.getNode(ISD::SHL, DL, VT,14253V,14254DAG.getConstant(Log2_32(MulAmt - 1), DL,14255MVT::i32)));14256} else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {14257// (mul x, 2^N - 1) => (sub (shl x, N), x)14258Res = DAG.getNode(ISD::SUB, DL, VT,14259DAG.getNode(ISD::SHL, DL, VT,14260V,14261DAG.getConstant(Log2_32(MulAmt + 1), DL,14262MVT::i32)),14263V);14264} else14265return SDValue();14266} else {14267uint64_t MulAmtAbs = -MulAmt;14268if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {14269// (mul x, -(2^N - 1)) => (sub x, (shl x, N))14270Res = DAG.getNode(ISD::SUB, DL, VT,14271V,14272DAG.getNode(ISD::SHL, DL, VT,14273V,14274DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,14275MVT::i32)));14276} else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {14277// (mul x, -(2^N + 1)) => - (add (shl x, N), x)14278Res = DAG.getNode(ISD::ADD, DL, VT,14279V,14280DAG.getNode(ISD::SHL, DL, VT,14281V,14282DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,14283MVT::i32)));14284Res = DAG.getNode(ISD::SUB, DL, VT,14285DAG.getConstant(0, DL, MVT::i32), Res);14286} else14287return SDValue();14288}1428914290if (ShiftAmt != 0)14291Res = DAG.getNode(ISD::SHL, DL, VT,14292Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));1429314294// Do not add new nodes to DAG combiner worklist.14295DCI.CombineTo(N, Res, false);14296return SDValue();14297}1429814299static SDValue CombineANDShift(SDNode *N,14300TargetLowering::DAGCombinerInfo &DCI,14301const ARMSubtarget *Subtarget) {14302// Allow DAGCombine to pattern-match before we touch the canonical form.14303if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())14304return SDValue();1430514306if (N->getValueType(0) != MVT::i32)14307return SDValue();1430814309ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));14310if (!N1C)14311return SDValue();1431214313uint32_t C1 = (uint32_t)N1C->getZExtValue();14314// Don't transform uxtb/uxth.14315if (C1 == 255 || C1 == 65535)14316return SDValue();1431714318SDNode *N0 = N->getOperand(0).getNode();14319if (!N0->hasOneUse())14320return SDValue();1432114322if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)14323return SDValue();1432414325bool LeftShift = N0->getOpcode() == ISD::SHL;1432614327ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));14328if (!N01C)14329return SDValue();1433014331uint32_t C2 = (uint32_t)N01C->getZExtValue();14332if (!C2 || C2 >= 32)14333return SDValue();1433414335// Clear irrelevant bits in the mask.14336if (LeftShift)14337C1 &= (-1U << C2);14338else14339C1 &= (-1U >> C2);1434014341SelectionDAG &DAG = DCI.DAG;14342SDLoc DL(N);1434314344// We have a pattern of the form "(and (shl x, c2) c1)" or14345// "(and (srl x, c2) c1)", where c1 is a shifted mask. 
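  // (A shifted mask is a contiguous run of set bits, e.g. 0x0ffffff0, which is
  // an expensive constant to materialize in Thumb1.)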
Try to14346// transform to a pair of shifts, to save materializing c1.1434714348// First pattern: right shift, then mask off leading bits.14349// FIXME: Use demanded bits?14350if (!LeftShift && isMask_32(C1)) {14351uint32_t C3 = llvm::countl_zero(C1);14352if (C2 < C3) {14353SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),14354DAG.getConstant(C3 - C2, DL, MVT::i32));14355return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,14356DAG.getConstant(C3, DL, MVT::i32));14357}14358}1435914360// First pattern, reversed: left shift, then mask off trailing bits.14361if (LeftShift && isMask_32(~C1)) {14362uint32_t C3 = llvm::countr_zero(C1);14363if (C2 < C3) {14364SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),14365DAG.getConstant(C3 - C2, DL, MVT::i32));14366return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,14367DAG.getConstant(C3, DL, MVT::i32));14368}14369}1437014371// Second pattern: left shift, then mask off leading bits.14372// FIXME: Use demanded bits?14373if (LeftShift && isShiftedMask_32(C1)) {14374uint32_t Trailing = llvm::countr_zero(C1);14375uint32_t C3 = llvm::countl_zero(C1);14376if (Trailing == C2 && C2 + C3 < 32) {14377SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),14378DAG.getConstant(C2 + C3, DL, MVT::i32));14379return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,14380DAG.getConstant(C3, DL, MVT::i32));14381}14382}1438314384// Second pattern, reversed: right shift, then mask off trailing bits.14385// FIXME: Handle other patterns of known/demanded bits.14386if (!LeftShift && isShiftedMask_32(C1)) {14387uint32_t Leading = llvm::countl_zero(C1);14388uint32_t C3 = llvm::countr_zero(C1);14389if (Leading == C2 && C2 + C3 < 32) {14390SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),14391DAG.getConstant(C2 + C3, DL, MVT::i32));14392return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,14393DAG.getConstant(C3, DL, MVT::i32));14394}14395}1439614397// Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"14398// if "c1 >> c2" is a cheaper immediate than "c1"14399if (LeftShift &&14400HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {1440114402SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),14403DAG.getConstant(C1 >> C2, DL, MVT::i32));14404return DAG.getNode(ISD::SHL, DL, MVT::i32, And,14405DAG.getConstant(C2, DL, MVT::i32));14406}1440714408return SDValue();14409}1441014411static SDValue PerformANDCombine(SDNode *N,14412TargetLowering::DAGCombinerInfo &DCI,14413const ARMSubtarget *Subtarget) {14414// Attempt to use immediate-form VBIC14415BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));14416SDLoc dl(N);14417EVT VT = N->getValueType(0);14418SelectionDAG &DAG = DCI.DAG;1441914420if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||14421VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)14422return SDValue();1442314424APInt SplatBits, SplatUndef;14425unsigned SplatBitSize;14426bool HasAnyUndefs;14427if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&14428BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {14429if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||14430SplatBitSize == 64) {14431EVT VbicVT;14432SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),14433SplatUndef.getZExtValue(), SplatBitSize,14434DAG, dl, VbicVT, VT, OtherModImm);14435if (Val.getNode()) {14436SDValue Input =14437DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));14438SDValue Vbic = 
DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}

// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  } else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}

static SDValue PerformORCombineToBFI(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //    iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //    2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //        && mask == ~mask2
  //    2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //        && ~mask == mask2
  //    (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      Val >>= llvm::countr_zero(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // Case (2): or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = llvm::countr_zero(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = llvm::countr_zero(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      ARM::isBitFieldInvertedMask(~Mask)) {
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = ShAmt->getAsZExtVal();
    unsigned LSB = llvm::countr_zero(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now
dead.14641return SDValue(N, 0);14642}1464314644return SDValue();14645}1464614647static bool isValidMVECond(unsigned CC, bool IsFloat) {14648switch (CC) {14649case ARMCC::EQ:14650case ARMCC::NE:14651case ARMCC::LE:14652case ARMCC::GT:14653case ARMCC::GE:14654case ARMCC::LT:14655return true;14656case ARMCC::HS:14657case ARMCC::HI:14658return !IsFloat;14659default:14660return false;14661};14662}1466314664static ARMCC::CondCodes getVCMPCondCode(SDValue N) {14665if (N->getOpcode() == ARMISD::VCMP)14666return (ARMCC::CondCodes)N->getConstantOperandVal(2);14667else if (N->getOpcode() == ARMISD::VCMPZ)14668return (ARMCC::CondCodes)N->getConstantOperandVal(1);14669else14670llvm_unreachable("Not a VCMP/VCMPZ!");14671}1467214673static bool CanInvertMVEVCMP(SDValue N) {14674ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));14675return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());14676}1467714678static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,14679const ARMSubtarget *Subtarget) {14680// Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain14681// together with predicates14682EVT VT = N->getValueType(0);14683SDLoc DL(N);14684SDValue N0 = N->getOperand(0);14685SDValue N1 = N->getOperand(1);1468614687auto IsFreelyInvertable = [&](SDValue V) {14688if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)14689return CanInvertMVEVCMP(V);14690return false;14691};1469214693// At least one operand must be freely invertable.14694if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))14695return SDValue();1469614697SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);14698SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);14699SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);14700return DAG.getLogicalNOT(DL, And, VT);14701}1470214703/// PerformORCombine - Target-specific dag combine xforms for ISD::OR14704static SDValue PerformORCombine(SDNode *N,14705TargetLowering::DAGCombinerInfo &DCI,14706const ARMSubtarget *Subtarget) {14707// Attempt to use immediate-form VORR14708BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));14709SDLoc dl(N);14710EVT VT = N->getValueType(0);14711SelectionDAG &DAG = DCI.DAG;1471214713if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))14714return SDValue();1471514716if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||14717VT == MVT::v8i1 || VT == MVT::v16i1))14718return PerformORCombine_i1(N, DAG, Subtarget);1471914720APInt SplatBits, SplatUndef;14721unsigned SplatBitSize;14722bool HasAnyUndefs;14723if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&14724BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {14725if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||14726SplatBitSize == 64) {14727EVT VorrVT;14728SDValue Val =14729isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),14730SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);14731if (Val.getNode()) {14732SDValue Input =14733DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));14734SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);14735return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);14736}14737}14738}1473914740if (!Subtarget->isThumb1Only()) {14741// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))14742if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))14743return Result;14744if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))14745return Result;14746}1474714748SDValue N0 
= N->getOperand(0);14749SDValue N1 = N->getOperand(1);1475014751// (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.14752if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&14753DAG.getTargetLoweringInfo().isTypeLegal(VT)) {1475414755// The code below optimizes (or (and X, Y), Z).14756// The AND operand needs to have a single user to make these optimizations14757// profitable.14758if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())14759return SDValue();1476014761APInt SplatUndef;14762unsigned SplatBitSize;14763bool HasAnyUndefs;1476414765APInt SplatBits0, SplatBits1;14766BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));14767BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));14768// Ensure that the second operand of both ands are constants14769if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,14770HasAnyUndefs) && !HasAnyUndefs) {14771if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,14772HasAnyUndefs) && !HasAnyUndefs) {14773// Ensure that the bit width of the constants are the same and that14774// the splat arguments are logical inverses as per the pattern we14775// are trying to simplify.14776if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&14777SplatBits0 == ~SplatBits1) {14778// Canonicalize the vector type to make instruction selection14779// simpler.14780EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;14781SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,14782N0->getOperand(1),14783N0->getOperand(0),14784N1->getOperand(0));14785return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);14786}14787}14788}14789}1479014791// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when14792// reasonable.14793if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {14794if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))14795return Res;14796}1479714798if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))14799return Result;1480014801return SDValue();14802}1480314804static SDValue PerformXORCombine(SDNode *N,14805TargetLowering::DAGCombinerInfo &DCI,14806const ARMSubtarget *Subtarget) {14807EVT VT = N->getValueType(0);14808SelectionDAG &DAG = DCI.DAG;1480914810if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))14811return SDValue();1481214813if (!Subtarget->isThumb1Only()) {14814// fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))14815if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))14816return Result;1481714818if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))14819return Result;14820}1482114822if (Subtarget->hasMVEIntegerOps()) {14823// fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.14824SDValue N0 = N->getOperand(0);14825SDValue N1 = N->getOperand(1);14826const TargetLowering *TLI = Subtarget->getTargetLowering();14827if (TLI->isConstTrueVal(N1) &&14828(N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {14829if (CanInvertMVEVCMP(N0)) {14830SDLoc DL(N0);14831ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));1483214833SmallVector<SDValue, 4> Ops;14834Ops.push_back(N0->getOperand(0));14835if (N0->getOpcode() == ARMISD::VCMP)14836Ops.push_back(N0->getOperand(1));14837Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));14838return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);14839}14840}14841}1484214843return SDValue();14844}1484514846// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return 
it,14847// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and14848// their position in "to" (Rd).14849static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {14850assert(N->getOpcode() == ARMISD::BFI);1485114852SDValue From = N->getOperand(1);14853ToMask = ~N->getConstantOperandAPInt(2);14854FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());1485514856// If the Base came from a SHR #C, we can deduce that it is really testing bit14857// #C in the base of the SHR.14858if (From->getOpcode() == ISD::SRL &&14859isa<ConstantSDNode>(From->getOperand(1))) {14860APInt Shift = From->getConstantOperandAPInt(1);14861assert(Shift.getLimitedValue() < 32 && "Shift too large!");14862FromMask <<= Shift.getLimitedValue(31);14863From = From->getOperand(0);14864}1486514866return From;14867}1486814869// If A and B contain one contiguous set of bits, does A | B == A . B?14870//14871// Neither A nor B must be zero.14872static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {14873unsigned LastActiveBitInA = A.countr_zero();14874unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;14875return LastActiveBitInA - 1 == FirstActiveBitInB;14876}1487714878static SDValue FindBFIToCombineWith(SDNode *N) {14879// We have a BFI in N. Find a BFI it can combine with, if one exists.14880APInt ToMask, FromMask;14881SDValue From = ParseBFI(N, ToMask, FromMask);14882SDValue To = N->getOperand(0);1488314884SDValue V = To;14885if (V.getOpcode() != ARMISD::BFI)14886return SDValue();1488714888APInt NewToMask, NewFromMask;14889SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);14890if (NewFrom != From)14891return SDValue();1489214893// Do the written bits conflict with any we've seen so far?14894if ((NewToMask & ToMask).getBoolValue())14895// Conflicting bits.14896return SDValue();1489714898// Are the new bits contiguous when combined with the old bits?14899if (BitsProperlyConcatenate(ToMask, NewToMask) &&14900BitsProperlyConcatenate(FromMask, NewFromMask))14901return V;14902if (BitsProperlyConcatenate(NewToMask, ToMask) &&14903BitsProperlyConcatenate(NewFromMask, FromMask))14904return V;1490514906return SDValue();14907}1490814909static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {14910SDValue N0 = N->getOperand(0);14911SDValue N1 = N->getOperand(1);1491214913if (N1.getOpcode() == ISD::AND) {14914// (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff14915// the bits being cleared by the AND are not demanded by the BFI.14916ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));14917if (!N11C)14918return SDValue();14919unsigned InvMask = N->getConstantOperandVal(2);14920unsigned LSB = llvm::countr_zero(~InvMask);14921unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;14922assert(Width <14923static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&14924"undefined behavior");14925unsigned Mask = (1u << Width) - 1;14926unsigned Mask2 = N11C->getZExtValue();14927if ((Mask & (~Mask2)) == 0)14928return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),14929N->getOperand(0), N1.getOperand(0), N->getOperand(2));14930return SDValue();14931}1493214933// Look for another BFI to combine with.14934if (SDValue CombineBFI = FindBFIToCombineWith(N)) {14935// We've found a BFI.14936APInt ToMask1, FromMask1;14937SDValue From1 = ParseBFI(N, ToMask1, FromMask1);1493814939APInt ToMask2, FromMask2;14940SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);14941assert(From1 == 
From2);
    (void)From2;

    // Create a new BFI, combining the two together.
    APInt NewFromMask = FromMask1 | FromMask2;
    APInt NewToMask = ToMask1 | ToMask2;

    EVT VT = N->getValueType(0);
    SDLoc dl(N);

    if (NewFromMask[0] == 0)
      From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
                          DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
    return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
                       DAG.getConstant(~NewToMask, dl, VT));
  }

  // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
  // that lower bit insertions are performed first, provided that M1 and M2
  // do not overlap. This can allow multiple BFI instructions to be combined
  // together by the other folds above.
  if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
    APInt ToMask1 = ~N->getConstantOperandAPInt(2);
    APInt ToMask2 = ~N0.getConstantOperandAPInt(2);

    if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
        ToMask1.countl_zero() < ToMask2.countl_zero())
      return SDValue();

    EVT VT = N->getValueType(0);
    SDLoc dl(N);
    SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
                               N->getOperand(1), N->getOperand(2));
    return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
                       N0.getOperand(2));
  }

  return SDValue();
}

// Check that N is CMPZ(CSINC(0, 0, CC, X))
// or CMPZ(CMOV(1, 0, CC, $cpsr, X)),
// and return X if valid.
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
  if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
    return SDValue();
  SDValue CSInc = Cmp->getOperand(0);

  // Ignore any `And 1` nodes that may not yet have been removed. We are
  // looking for a value that produces 1/0, so these have no effect on the
  // code.
  while (CSInc.getOpcode() == ISD::AND &&
         isa<ConstantSDNode>(CSInc.getOperand(1)) &&
         CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
    CSInc = CSInc.getOperand(0);

  if (CSInc.getOpcode() == ARMISD::CSINC &&
      isNullConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
    return CSInc.getOperand(3);
  }
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
      isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
    CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
    return CSInc.getOperand(4);
  }
  if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
      isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
    CC = ARMCC::getOppositeCondition(
        (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
    return CSInc.getOperand(4);
  }
  return SDValue();
}

static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
  // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly.
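  // (The inner CSINC merely materializes its condition as a 0/1 value, so
  // comparing that value against zero carries no information beyond the
  // original flags C.)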
  // As in
  //   t92: glue = ARMISD::CMPZ t74, 0
  //   t93: i32 = ARMISD::CSINC 0, 0, 1, t92
  //   t96: glue = ARMISD::CMPZ t93, 0
  //   t114: i32 = ARMISD::CSINV 0, 0, 0, t96
  ARMCC::CondCodes Cond;
  if (SDValue C = IsCMPZCSINC(N, Cond))
    if (Cond == ARMCC::EQ)
      return C;
  return SDValue();
}

static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
  // Fold away an unnecessary CMPZ/CSINC
  // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
  // if C1==EQ -> CSXYZ A, B, C2, D
  // if C1==NE -> CSXYZ A, B, NOT(C2), D
  ARMCC::CondCodes Cond;
  if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
    if (N->getConstantOperandVal(2) == ARMCC::EQ)
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
    if (N->getConstantOperandVal(2) == ARMCC::NE)
      return DAG.getNode(
          N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
          N->getOperand(1),
          DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N),
                          MVT::i32),
          C);
  }
  return SDValue();
}

/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
/// ARMISD::VMOVRRD.
static SDValue PerformVMOVRRDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *Subtarget) {
  // vmovrrd(vmovdrr x, y) -> x,y
  SDValue InDouble = N->getOperand(0);
  if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));

  // vmovrrd(load f64) -> (load i32), (load i32)
  SDNode *InNode = InDouble.getNode();
  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
      InNode->getValueType(0) == MVT::f64 &&
      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
      !cast<LoadSDNode>(InNode)->isVolatile()) {
    // TODO: Should this be done for non-FrameIndex operands?
    LoadSDNode *LD = cast<LoadSDNode>(InNode);

    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(LD);
    SDValue BasePtr = LD->getBasePtr();
    SDValue NewLD1 =
        DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                    LD->getAlign(), LD->getMemOperand()->getFlags());

    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(4, DL, MVT::i32));

    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
                                 LD->getPointerInfo().getWithOffset(4),
                                 commonAlignment(LD->getAlign(), 4),
                                 LD->getMemOperand()->getFlags());

    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
    if (DCI.DAG.getDataLayout().isBigEndian())
      std::swap(NewLD1, NewLD2);
    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
    return Result;
  }

  // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
  // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
  if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(InDouble.getOperand(1))) {
    SDValue BV = InDouble.getOperand(0);
    // Look up through any nop bitcasts and vector_reg_casts.
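    // (Only casts with a v2f64/v2i64 result are looked through here; the walk
    // is expected to end at the v4i32 source of the extracted 64-bit lane.)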
bitcasts may15097// change lane order under big endian.15098bool BVSwap = BV.getOpcode() == ISD::BITCAST;15099while (15100(BV.getOpcode() == ISD::BITCAST ||15101BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&15102(BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {15103BVSwap = BV.getOpcode() == ISD::BITCAST;15104BV = BV.getOperand(0);15105}15106if (BV.getValueType() != MVT::v4i32)15107return SDValue();1510815109// Handle buildvectors, pulling out the correct lane depending on15110// endianness.15111unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;15112if (BV.getOpcode() == ISD::BUILD_VECTOR) {15113SDValue Op0 = BV.getOperand(Offset);15114SDValue Op1 = BV.getOperand(Offset + 1);15115if (!Subtarget->isLittle() && BVSwap)15116std::swap(Op0, Op1);1511715118return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));15119}1512015121// A chain of insert_vectors, grabbing the correct value of the chain of15122// inserts.15123SDValue Op0, Op1;15124while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {15125if (isa<ConstantSDNode>(BV.getOperand(2))) {15126if (BV.getConstantOperandVal(2) == Offset)15127Op0 = BV.getOperand(1);15128if (BV.getConstantOperandVal(2) == Offset + 1)15129Op1 = BV.getOperand(1);15130}15131BV = BV.getOperand(0);15132}15133if (!Subtarget->isLittle() && BVSwap)15134std::swap(Op0, Op1);15135if (Op0 && Op1)15136return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));15137}1513815139return SDValue();15140}1514115142/// PerformVMOVDRRCombine - Target-specific dag combine xforms for15143/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.15144static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {15145// N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)15146SDValue Op0 = N->getOperand(0);15147SDValue Op1 = N->getOperand(1);15148if (Op0.getOpcode() == ISD::BITCAST)15149Op0 = Op0.getOperand(0);15150if (Op1.getOpcode() == ISD::BITCAST)15151Op1 = Op1.getOperand(0);15152if (Op0.getOpcode() == ARMISD::VMOVRRD &&15153Op0.getNode() == Op1.getNode() &&15154Op0.getResNo() == 0 && Op1.getResNo() == 1)15155return DAG.getNode(ISD::BITCAST, SDLoc(N),15156N->getValueType(0), Op0.getOperand(0));15157return SDValue();15158}1515915160static SDValue PerformVMOVhrCombine(SDNode *N,15161TargetLowering::DAGCombinerInfo &DCI) {15162SDValue Op0 = N->getOperand(0);1516315164// VMOVhr (VMOVrh (X)) -> X15165if (Op0->getOpcode() == ARMISD::VMOVrh)15166return Op0->getOperand(0);1516715168// FullFP16: half values are passed in S-registers, and we don't15169// need any of the bitcast and moves:15170//15171// t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?15172// t5: i32 = bitcast t215173// t18: f16 = ARMISD::VMOVhr t515174// =>15175// tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?15176if (Op0->getOpcode() == ISD::BITCAST) {15177SDValue Copy = Op0->getOperand(0);15178if (Copy.getValueType() == MVT::f32 &&15179Copy->getOpcode() == ISD::CopyFromReg) {15180bool HasGlue = Copy->getNumOperands() == 3;15181SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),15182HasGlue ? Copy->getOperand(2) : SDValue()};15183EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};15184SDValue NewCopy =15185DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N),15186DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),15187ArrayRef(Ops, HasGlue ? 
3 : 2));1518815189// Update Users, Chains, and Potential Glue.15190DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));15191DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));15192if (HasGlue)15193DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),15194NewCopy.getValue(2));1519515196return NewCopy;15197}15198}1519915200// fold (VMOVhr (load x)) -> (load (f16*)x)15201if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {15202if (LN0->hasOneUse() && LN0->isUnindexed() &&15203LN0->getMemoryVT() == MVT::i16) {15204SDValue Load =15205DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),15206LN0->getBasePtr(), LN0->getMemOperand());15207DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));15208DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));15209return Load;15210}15211}1521215213// Only the bottom 16 bits of the source register are used.15214APInt DemandedMask = APInt::getLowBitsSet(32, 16);15215const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();15216if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))15217return SDValue(N, 0);1521815219return SDValue();15220}1522115222static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {15223SDValue N0 = N->getOperand(0);15224EVT VT = N->getValueType(0);1522515226// fold (VMOVrh (fpconst x)) -> const x15227if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {15228APFloat V = C->getValueAPF();15229return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);15230}1523115232// fold (VMOVrh (load x)) -> (zextload (i16*)x)15233if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {15234LoadSDNode *LN0 = cast<LoadSDNode>(N0);1523515236SDValue Load =15237DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),15238LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());15239DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));15240DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));15241return Load;15242}1524315244// Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)15245if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&15246isa<ConstantSDNode>(N0->getOperand(1)))15247return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),15248N0->getOperand(1));1524915250return SDValue();15251}1525215253/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node15254/// are normal, non-volatile loads. If so, it is profitable to bitcast an15255/// i64 vector to have f64 elements, since the value can then be loaded15256/// directly into a VFP register.15257static bool hasNormalLoadOperand(SDNode *N) {15258unsigned NumElts = N->getValueType(0).getVectorNumElements();15259for (unsigned i = 0; i < NumElts; ++i) {15260SDNode *Elt = N->getOperand(i).getNode();15261if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())15262return true;15263}15264return false;15265}1526615267/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for15268/// ISD::BUILD_VECTOR.15269static SDValue PerformBUILD_VECTORCombine(SDNode *N,15270TargetLowering::DAGCombinerInfo &DCI,15271const ARMSubtarget *Subtarget) {15272// build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):15273// VMOVRRD is introduced when legalizing i64 types. 
It forces the i64 value15274// into a pair of GPRs, which is fine when the value is used as a scalar,15275// but if the i64 value is converted to a vector, we need to undo the VMOVRRD.15276SelectionDAG &DAG = DCI.DAG;15277if (N->getNumOperands() == 2)15278if (SDValue RV = PerformVMOVDRRCombine(N, DAG))15279return RV;1528015281// Load i64 elements as f64 values so that type legalization does not split15282// them up into i32 values.15283EVT VT = N->getValueType(0);15284if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))15285return SDValue();15286SDLoc dl(N);15287SmallVector<SDValue, 8> Ops;15288unsigned NumElts = VT.getVectorNumElements();15289for (unsigned i = 0; i < NumElts; ++i) {15290SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));15291Ops.push_back(V);15292// Make the DAGCombiner fold the bitcast.15293DCI.AddToWorklist(V.getNode());15294}15295EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);15296SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);15297return DAG.getNode(ISD::BITCAST, dl, VT, BV);15298}1529915300/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.15301static SDValue15302PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {15303// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.15304// At that time, we may have inserted bitcasts from integer to float.15305// If these bitcasts have survived DAGCombine, change the lowering of this15306// BUILD_VECTOR in something more vector friendly, i.e., that does not15307// force to use floating point types.1530815309// Make sure we can change the type of the vector.15310// This is possible iff:15311// 1. The vector is only used in a bitcast to a integer type. I.e.,15312// 1.1. Vector is used only once.15313// 1.2. Use is a bit convert to an integer type.15314// 2. The size of its operands are 32-bits (64-bits are not legal).15315EVT VT = N->getValueType(0);15316EVT EltVT = VT.getVectorElementType();1531715318// Check 1.1. 
and 2.15319if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())15320return SDValue();1532115322// By construction, the input type must be float.15323assert(EltVT == MVT::f32 && "Unexpected type!");1532415325// Check 1.2.15326SDNode *Use = *N->use_begin();15327if (Use->getOpcode() != ISD::BITCAST ||15328Use->getValueType(0).isFloatingPoint())15329return SDValue();1533015331// Check profitability.15332// Model is, if more than half of the relevant operands are bitcast from15333// i32, turn the build_vector into a sequence of insert_vector_elt.15334// Relevant operands are everything that is not statically15335// (i.e., at compile time) bitcasted.15336unsigned NumOfBitCastedElts = 0;15337unsigned NumElts = VT.getVectorNumElements();15338unsigned NumOfRelevantElts = NumElts;15339for (unsigned Idx = 0; Idx < NumElts; ++Idx) {15340SDValue Elt = N->getOperand(Idx);15341if (Elt->getOpcode() == ISD::BITCAST) {15342// Assume only bit cast to i32 will go away.15343if (Elt->getOperand(0).getValueType() == MVT::i32)15344++NumOfBitCastedElts;15345} else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))15346// Constants are statically casted, thus do not count them as15347// relevant operands.15348--NumOfRelevantElts;15349}1535015351// Check if more than half of the elements require a non-free bitcast.15352if (NumOfBitCastedElts <= NumOfRelevantElts / 2)15353return SDValue();1535415355SelectionDAG &DAG = DCI.DAG;15356// Create the new vector type.15357EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);15358// Check if the type is legal.15359const TargetLowering &TLI = DAG.getTargetLoweringInfo();15360if (!TLI.isTypeLegal(VecVT))15361return SDValue();1536215363// Combine:15364// ARMISD::BUILD_VECTOR E1, E2, ..., EN.15365// => BITCAST INSERT_VECTOR_ELT15366// (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),15367// (BITCAST EN), N.15368SDValue Vec = DAG.getUNDEF(VecVT);15369SDLoc dl(N);15370for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {15371SDValue V = N->getOperand(Idx);15372if (V.isUndef())15373continue;15374if (V.getOpcode() == ISD::BITCAST &&15375V->getOperand(0).getValueType() == MVT::i32)15376// Fold obvious case.15377V = V.getOperand(0);15378else {15379V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);15380// Make the DAGCombiner fold the bitcasts.15381DCI.AddToWorklist(V.getNode());15382}15383SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);15384Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);15385}15386Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);15387// Make the DAGCombiner fold the bitcasts.15388DCI.AddToWorklist(Vec.getNode());15389return Vec;15390}1539115392static SDValue15393PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {15394EVT VT = N->getValueType(0);15395SDValue Op = N->getOperand(0);15396SDLoc dl(N);1539715398// PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)15399if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {15400// If the valuetypes are the same, we can remove the cast entirely.15401if (Op->getOperand(0).getValueType() == VT)15402return Op->getOperand(0);15403return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));15404}1540515406// Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce15407// more VPNOT which might get folded as else predicates.15408if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {15409SDValue X =15410DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));15411SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, 
VT,15412DCI.DAG.getConstant(65535, dl, MVT::i32));15413return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);15414}1541515416// Only the bottom 16 bits of the source register are used.15417if (Op.getValueType() == MVT::i32) {15418APInt DemandedMask = APInt::getLowBitsSet(32, 16);15419const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();15420if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))15421return SDValue(N, 0);15422}15423return SDValue();15424}1542515426static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,15427const ARMSubtarget *ST) {15428EVT VT = N->getValueType(0);15429SDValue Op = N->getOperand(0);15430SDLoc dl(N);1543115432// Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST15433if (ST->isLittle())15434return DAG.getNode(ISD::BITCAST, dl, VT, Op);1543515436// VECTOR_REG_CAST undef -> undef15437if (Op.isUndef())15438return DAG.getUNDEF(VT);1543915440// VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)15441if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {15442// If the valuetypes are the same, we can remove the cast entirely.15443if (Op->getOperand(0).getValueType() == VT)15444return Op->getOperand(0);15445return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));15446}1544715448return SDValue();15449}1545015451static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,15452const ARMSubtarget *Subtarget) {15453if (!Subtarget->hasMVEIntegerOps())15454return SDValue();1545515456EVT VT = N->getValueType(0);15457SDValue Op0 = N->getOperand(0);15458SDValue Op1 = N->getOperand(1);15459ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);15460SDLoc dl(N);1546115462// vcmp X, 0, cc -> vcmpz X, cc15463if (isZeroVector(Op1))15464return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));1546515466unsigned SwappedCond = getSwappedCondition(Cond);15467if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {15468// vcmp 0, X, cc -> vcmpz X, reversed(cc)15469if (isZeroVector(Op0))15470return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,15471DAG.getConstant(SwappedCond, dl, MVT::i32));15472// vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)15473if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)15474return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,15475DAG.getConstant(SwappedCond, dl, MVT::i32));15476}1547715478return SDValue();15479}1548015481/// PerformInsertEltCombine - Target-specific dag combine xforms for15482/// ISD::INSERT_VECTOR_ELT.15483static SDValue PerformInsertEltCombine(SDNode *N,15484TargetLowering::DAGCombinerInfo &DCI) {15485// Bitcast an i64 load inserted into a vector to f64.15486// Otherwise, the i64 value will be legalized to a pair of i32 values.15487EVT VT = N->getValueType(0);15488SDNode *Elt = N->getOperand(1).getNode();15489if (VT.getVectorElementType() != MVT::i64 ||15490!ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())15491return SDValue();1549215493SelectionDAG &DAG = DCI.DAG;15494SDLoc dl(N);15495EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,15496VT.getVectorNumElements());15497SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));15498SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));15499// Make the DAGCombiner fold the bitcasts.15500DCI.AddToWorklist(Vec.getNode());15501DCI.AddToWorklist(V.getNode());15502SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,15503Vec, V, N->getOperand(2));15504return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);15505}1550615507// Convert a pair of 
extracts from the same base vector to a VMOVRRD. Either15508// directly or bitcast to an integer if the original is a float vector.15509// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)15510// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)15511static SDValue15512PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {15513EVT VT = N->getValueType(0);15514SDLoc dl(N);1551515516if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||15517!DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))15518return SDValue();1551915520SDValue Ext = SDValue(N, 0);15521if (Ext.getOpcode() == ISD::BITCAST &&15522Ext.getOperand(0).getValueType() == MVT::f32)15523Ext = Ext.getOperand(0);15524if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||15525!isa<ConstantSDNode>(Ext.getOperand(1)) ||15526Ext.getConstantOperandVal(1) % 2 != 0)15527return SDValue();15528if (Ext->use_size() == 1 &&15529(Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||15530Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))15531return SDValue();1553215533SDValue Op0 = Ext.getOperand(0);15534EVT VecVT = Op0.getValueType();15535unsigned ResNo = Op0.getResNo();15536unsigned Lane = Ext.getConstantOperandVal(1);15537if (VecVT.getVectorNumElements() != 4)15538return SDValue();1553915540// Find another extract, of Lane + 115541auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {15542return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&15543isa<ConstantSDNode>(V->getOperand(1)) &&15544V->getConstantOperandVal(1) == Lane + 1 &&15545V->getOperand(0).getResNo() == ResNo;15546});15547if (OtherIt == Op0->uses().end())15548return SDValue();1554915550// For float extracts, we need to be converting to a i32 for both vector15551// lanes.15552SDValue OtherExt(*OtherIt, 0);15553if (OtherExt.getValueType() != MVT::i32) {15554if (OtherExt->use_size() != 1 ||15555OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||15556OtherExt->use_begin()->getValueType(0) != MVT::i32)15557return SDValue();15558OtherExt = SDValue(*OtherExt->use_begin(), 0);15559}1556015561// Convert the type to a f64 and extract with a VMOVRRD.15562SDValue F64 = DCI.DAG.getNode(15563ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,15564DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),15565DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));15566SDValue VMOVRRD =15567DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);1556815569DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));15570return VMOVRRD;15571}1557215573static SDValue PerformExtractEltCombine(SDNode *N,15574TargetLowering::DAGCombinerInfo &DCI,15575const ARMSubtarget *ST) {15576SDValue Op0 = N->getOperand(0);15577EVT VT = N->getValueType(0);15578SDLoc dl(N);1557915580// extract (vdup x) -> x15581if (Op0->getOpcode() == ARMISD::VDUP) {15582SDValue X = Op0->getOperand(0);15583if (VT == MVT::f16 && X.getValueType() == MVT::i32)15584return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);15585if (VT == MVT::i32 && X.getValueType() == MVT::f16)15586return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);15587if (VT == MVT::f32 && X.getValueType() == MVT::i32)15588return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);1558915590while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)15591X = X->getOperand(0);15592if (X.getValueType() == VT)15593return X;15594}1559515596// extract ARM_BUILD_VECTOR -> x15597if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&15598isa<ConstantSDNode>(N->getOperand(1)) &&15599N->getConstantOperandVal(1) < 
Op0.getNumOperands()) {15600return Op0.getOperand(N->getConstantOperandVal(1));15601}1560215603// extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b15604if (Op0.getValueType() == MVT::v4i32 &&15605isa<ConstantSDNode>(N->getOperand(1)) &&15606Op0.getOpcode() == ISD::BITCAST &&15607Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&15608Op0.getOperand(0).getValueType() == MVT::v2f64) {15609SDValue BV = Op0.getOperand(0);15610unsigned Offset = N->getConstantOperandVal(1);15611SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);15612if (MOV.getOpcode() == ARMISD::VMOVDRR)15613return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);15614}1561515616// extract x, n; extract x, n+1 -> VMOVRRD x15617if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))15618return R;1561915620// extract (MVETrunc(x)) -> extract x15621if (Op0->getOpcode() == ARMISD::MVETRUNC) {15622unsigned Idx = N->getConstantOperandVal(1);15623unsigned Vec =15624Idx / Op0->getOperand(0).getValueType().getVectorNumElements();15625unsigned SubIdx =15626Idx % Op0->getOperand(0).getValueType().getVectorNumElements();15627return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),15628DCI.DAG.getConstant(SubIdx, dl, MVT::i32));15629}1563015631return SDValue();15632}1563315634static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {15635SDValue Op = N->getOperand(0);15636EVT VT = N->getValueType(0);1563715638// sext_inreg(VGETLANEu) -> VGETLANEs15639if (Op.getOpcode() == ARMISD::VGETLANEu &&15640cast<VTSDNode>(N->getOperand(1))->getVT() ==15641Op.getOperand(0).getValueType().getScalarType())15642return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),15643Op.getOperand(1));1564415645return SDValue();15646}1564715648static SDValue15649PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {15650SDValue Vec = N->getOperand(0);15651SDValue SubVec = N->getOperand(1);15652uint64_t IdxVal = N->getConstantOperandVal(2);15653EVT VecVT = Vec.getValueType();15654EVT SubVT = SubVec.getValueType();1565515656// Only do this for legal fixed vector types.15657if (!VecVT.isFixedLengthVector() ||15658!DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||15659!DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))15660return SDValue();1566115662// Ignore widening patterns.15663if (IdxVal == 0 && Vec.isUndef())15664return SDValue();1566515666// Subvector must be half the width and an "aligned" insertion.15667unsigned NumSubElts = SubVT.getVectorNumElements();15668if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||15669(IdxVal != 0 && IdxVal != NumSubElts))15670return SDValue();1567115672// Fold insert_subvector -> concat_vectors15673// insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))15674// insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)15675SDLoc DL(N);15676SDValue Lo, Hi;15677if (IdxVal == 0) {15678Lo = SubVec;15679Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,15680DCI.DAG.getVectorIdxConstant(NumSubElts, DL));15681} else {15682Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,15683DCI.DAG.getVectorIdxConstant(0, DL));15684Hi = SubVec;15685}15686return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);15687}1568815689// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)15690static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,15691SelectionDAG &DAG) {15692SDValue Trunc = N->getOperand(0);15693EVT VT = Trunc.getValueType();15694if (Trunc.getOpcode() != ARMISD::MVETRUNC || 
!N->getOperand(1).isUndef())15695return SDValue();1569615697SDLoc DL(Trunc);15698if (isVMOVNTruncMask(N->getMask(), VT, false))15699return DAG.getNode(15700ARMISD::VMOVN, DL, VT,15701DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),15702DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),15703DAG.getConstant(1, DL, MVT::i32));15704else if (isVMOVNTruncMask(N->getMask(), VT, true))15705return DAG.getNode(15706ARMISD::VMOVN, DL, VT,15707DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),15708DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),15709DAG.getConstant(1, DL, MVT::i32));15710return SDValue();15711}1571215713/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for15714/// ISD::VECTOR_SHUFFLE.15715static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {15716if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))15717return R;1571815719// The LLVM shufflevector instruction does not require the shuffle mask15720// length to match the operand vector length, but ISD::VECTOR_SHUFFLE does15721// have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the15722// operands do not match the mask length, they are extended by concatenating15723// them with undef vectors. That is probably the right thing for other15724// targets, but for NEON it is better to concatenate two double-register15725// size vector operands into a single quad-register size vector. Do that15726// transformation here:15727// shuffle(concat(v1, undef), concat(v2, undef)) ->15728// shuffle(concat(v1, v2), undef)15729SDValue Op0 = N->getOperand(0);15730SDValue Op1 = N->getOperand(1);15731if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||15732Op1.getOpcode() != ISD::CONCAT_VECTORS ||15733Op0.getNumOperands() != 2 ||15734Op1.getNumOperands() != 2)15735return SDValue();15736SDValue Concat0Op1 = Op0.getOperand(1);15737SDValue Concat1Op1 = Op1.getOperand(1);15738if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())15739return SDValue();15740// Skip the transformation if any of the types are illegal.15741const TargetLowering &TLI = DAG.getTargetLoweringInfo();15742EVT VT = N->getValueType(0);15743if (!TLI.isTypeLegal(VT) ||15744!TLI.isTypeLegal(Concat0Op1.getValueType()) ||15745!TLI.isTypeLegal(Concat1Op1.getValueType()))15746return SDValue();1574715748SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,15749Op0.getOperand(0), Op1.getOperand(0));15750// Translate the shuffle mask.15751SmallVector<int, 16> NewMask;15752unsigned NumElts = VT.getVectorNumElements();15753unsigned HalfElts = NumElts/2;15754ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);15755for (unsigned n = 0; n < NumElts; ++n) {15756int MaskElt = SVN->getMaskElt(n);15757int NewElt = -1;15758if (MaskElt < (int)HalfElts)15759NewElt = MaskElt;15760else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))15761NewElt = HalfElts + MaskElt - NumElts;15762NewMask.push_back(NewElt);15763}15764return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,15765DAG.getUNDEF(VT), NewMask);15766}1576715768/// Load/store instruction that can be merged with a base address15769/// update15770struct BaseUpdateTarget {15771SDNode *N;15772bool isIntrinsic;15773bool isStore;15774unsigned AddrOpIdx;15775};1577615777struct BaseUpdateUser {15778/// Instruction that updates a pointer15779SDNode *N;15780/// Pointer increment operand15781SDValue Inc;15782/// Pointer increment value if it is a constant, or 0 otherwise15783unsigned 
ConstInc;15784};1578515786static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,15787struct BaseUpdateUser &User,15788bool SimpleConstIncOnly,15789TargetLowering::DAGCombinerInfo &DCI) {15790SelectionDAG &DAG = DCI.DAG;15791SDNode *N = Target.N;15792MemSDNode *MemN = cast<MemSDNode>(N);15793SDLoc dl(N);1579415795// Find the new opcode for the updating load/store.15796bool isLoadOp = true;15797bool isLaneOp = false;15798// Workaround for vst1x and vld1x intrinsics which do not have alignment15799// as an operand.15800bool hasAlignment = true;15801unsigned NewOpc = 0;15802unsigned NumVecs = 0;15803if (Target.isIntrinsic) {15804unsigned IntNo = N->getConstantOperandVal(1);15805switch (IntNo) {15806default:15807llvm_unreachable("unexpected intrinsic for Neon base update");15808case Intrinsic::arm_neon_vld1:15809NewOpc = ARMISD::VLD1_UPD;15810NumVecs = 1;15811break;15812case Intrinsic::arm_neon_vld2:15813NewOpc = ARMISD::VLD2_UPD;15814NumVecs = 2;15815break;15816case Intrinsic::arm_neon_vld3:15817NewOpc = ARMISD::VLD3_UPD;15818NumVecs = 3;15819break;15820case Intrinsic::arm_neon_vld4:15821NewOpc = ARMISD::VLD4_UPD;15822NumVecs = 4;15823break;15824case Intrinsic::arm_neon_vld1x2:15825NewOpc = ARMISD::VLD1x2_UPD;15826NumVecs = 2;15827hasAlignment = false;15828break;15829case Intrinsic::arm_neon_vld1x3:15830NewOpc = ARMISD::VLD1x3_UPD;15831NumVecs = 3;15832hasAlignment = false;15833break;15834case Intrinsic::arm_neon_vld1x4:15835NewOpc = ARMISD::VLD1x4_UPD;15836NumVecs = 4;15837hasAlignment = false;15838break;15839case Intrinsic::arm_neon_vld2dup:15840NewOpc = ARMISD::VLD2DUP_UPD;15841NumVecs = 2;15842break;15843case Intrinsic::arm_neon_vld3dup:15844NewOpc = ARMISD::VLD3DUP_UPD;15845NumVecs = 3;15846break;15847case Intrinsic::arm_neon_vld4dup:15848NewOpc = ARMISD::VLD4DUP_UPD;15849NumVecs = 4;15850break;15851case Intrinsic::arm_neon_vld2lane:15852NewOpc = ARMISD::VLD2LN_UPD;15853NumVecs = 2;15854isLaneOp = true;15855break;15856case Intrinsic::arm_neon_vld3lane:15857NewOpc = ARMISD::VLD3LN_UPD;15858NumVecs = 3;15859isLaneOp = true;15860break;15861case Intrinsic::arm_neon_vld4lane:15862NewOpc = ARMISD::VLD4LN_UPD;15863NumVecs = 4;15864isLaneOp = true;15865break;15866case Intrinsic::arm_neon_vst1:15867NewOpc = ARMISD::VST1_UPD;15868NumVecs = 1;15869isLoadOp = false;15870break;15871case Intrinsic::arm_neon_vst2:15872NewOpc = ARMISD::VST2_UPD;15873NumVecs = 2;15874isLoadOp = false;15875break;15876case Intrinsic::arm_neon_vst3:15877NewOpc = ARMISD::VST3_UPD;15878NumVecs = 3;15879isLoadOp = false;15880break;15881case Intrinsic::arm_neon_vst4:15882NewOpc = ARMISD::VST4_UPD;15883NumVecs = 4;15884isLoadOp = false;15885break;15886case Intrinsic::arm_neon_vst2lane:15887NewOpc = ARMISD::VST2LN_UPD;15888NumVecs = 2;15889isLoadOp = false;15890isLaneOp = true;15891break;15892case Intrinsic::arm_neon_vst3lane:15893NewOpc = ARMISD::VST3LN_UPD;15894NumVecs = 3;15895isLoadOp = false;15896isLaneOp = true;15897break;15898case Intrinsic::arm_neon_vst4lane:15899NewOpc = ARMISD::VST4LN_UPD;15900NumVecs = 4;15901isLoadOp = false;15902isLaneOp = true;15903break;15904case Intrinsic::arm_neon_vst1x2:15905NewOpc = ARMISD::VST1x2_UPD;15906NumVecs = 2;15907isLoadOp = false;15908hasAlignment = false;15909break;15910case Intrinsic::arm_neon_vst1x3:15911NewOpc = ARMISD::VST1x3_UPD;15912NumVecs = 3;15913isLoadOp = false;15914hasAlignment = false;15915break;15916case Intrinsic::arm_neon_vst1x4:15917NewOpc = ARMISD::VST1x4_UPD;15918NumVecs = 4;15919isLoadOp = false;15920hasAlignment = false;15921break;15922}15923} else 
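  // Non-intrinsic case: the ARMISD::VLDnDUP nodes and plain vector
  // ISD::LOAD/ISD::STORE, each mapped below to its post-indexed _UPD form.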
{15924isLaneOp = true;15925switch (N->getOpcode()) {15926default:15927llvm_unreachable("unexpected opcode for Neon base update");15928case ARMISD::VLD1DUP:15929NewOpc = ARMISD::VLD1DUP_UPD;15930NumVecs = 1;15931break;15932case ARMISD::VLD2DUP:15933NewOpc = ARMISD::VLD2DUP_UPD;15934NumVecs = 2;15935break;15936case ARMISD::VLD3DUP:15937NewOpc = ARMISD::VLD3DUP_UPD;15938NumVecs = 3;15939break;15940case ARMISD::VLD4DUP:15941NewOpc = ARMISD::VLD4DUP_UPD;15942NumVecs = 4;15943break;15944case ISD::LOAD:15945NewOpc = ARMISD::VLD1_UPD;15946NumVecs = 1;15947isLaneOp = false;15948break;15949case ISD::STORE:15950NewOpc = ARMISD::VST1_UPD;15951NumVecs = 1;15952isLaneOp = false;15953isLoadOp = false;15954break;15955}15956}1595715958// Find the size of memory referenced by the load/store.15959EVT VecTy;15960if (isLoadOp) {15961VecTy = N->getValueType(0);15962} else if (Target.isIntrinsic) {15963VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();15964} else {15965assert(Target.isStore &&15966"Node has to be a load, a store, or an intrinsic!");15967VecTy = N->getOperand(1).getValueType();15968}1596915970bool isVLDDUPOp =15971NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||15972NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;1597315974unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;15975if (isLaneOp || isVLDDUPOp)15976NumBytes /= VecTy.getVectorNumElements();1597715978if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {15979// VLD3/4 and VST3/4 for 128-bit vectors are implemented with two15980// separate instructions that make it harder to use a non-constant update.15981return false;15982}1598315984if (SimpleConstIncOnly && User.ConstInc != NumBytes)15985return false;1598615987// OK, we found an ADD we can fold into the base update.15988// Now, create a _UPD node, taking care of not breaking alignment.1598915990EVT AlignedVecTy = VecTy;15991Align Alignment = MemN->getAlign();1599215993// If this is a less-than-standard-aligned load/store, change the type to15994// match the standard alignment.15995// The alignment is overlooked when selecting _UPD variants; and it's15996// easier to introduce bitcasts here than fix that.15997// There are 3 ways to get to this base-update combine:15998// - intrinsics: they are assumed to be properly aligned (to the standard15999// alignment of the memory type), so we don't need to do anything.16000// - ARMISD::VLDx nodes: they are only generated from the aforementioned16001// intrinsics, so, likewise, there's nothing to do.16002// - generic load/store instructions: the alignment is specified as an16003// explicit operand, rather than implicitly as the standard alignment16004// of the memory type (like the intrisics). We need to change the16005// memory type to match the explicit alignment. 
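  // For example, a generic v2i64 store that is only 4-byte aligned is
  // rewritten as a v4i32 access: the element width is derived from the
  // explicit alignment (4 bytes -> i32) rather than from the i64 scalar type.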
That way, we don't16006// generate non-standard-aligned ARMISD::VLDx nodes.16007if (isa<LSBaseSDNode>(N)) {16008if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {16009MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);16010assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");16011assert(!isLaneOp && "Unexpected generic load/store lane.");16012unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);16013AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);16014}16015// Don't set an explicit alignment on regular load/stores that we want16016// to transform to VLD/VST 1_UPD nodes.16017// This matches the behavior of regular load/stores, which only get an16018// explicit alignment if the MMO alignment is larger than the standard16019// alignment of the memory type.16020// Intrinsics, however, always get an explicit alignment, set to the16021// alignment of the MMO.16022Alignment = Align(1);16023}1602416025// Create the new updating load/store node.16026// First, create an SDVTList for the new updating node's results.16027EVT Tys[6];16028unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);16029unsigned n;16030for (n = 0; n < NumResultVecs; ++n)16031Tys[n] = AlignedVecTy;16032Tys[n++] = MVT::i32;16033Tys[n] = MVT::Other;16034SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));1603516036// Then, gather the new node's operands.16037SmallVector<SDValue, 8> Ops;16038Ops.push_back(N->getOperand(0)); // incoming chain16039Ops.push_back(N->getOperand(Target.AddrOpIdx));16040Ops.push_back(User.Inc);1604116042if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {16043// Try to match the intrinsic's signature16044Ops.push_back(StN->getValue());16045} else {16046// Loads (and of course intrinsics) match the intrinsics' signature,16047// so just add all but the alignment operand.16048unsigned LastOperand =16049hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();16050for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)16051Ops.push_back(N->getOperand(i));16052}1605316054// For all node types, the alignment operand is always the last one.16055Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));1605616057// If this is a non-standard-aligned STORE, the penultimate operand is the16058// stored value. Bitcast it to the aligned type.16059if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {16060SDValue &StVal = Ops[Ops.size() - 2];16061StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);16062}1606316064EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;16065SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,16066MemN->getMemOperand());1606716068// Update the uses.16069SmallVector<SDValue, 5> NewResults;16070for (unsigned i = 0; i < NumResultVecs; ++i)16071NewResults.push_back(SDValue(UpdN.getNode(), i));1607216073// If this is an non-standard-aligned LOAD, the first result is the loaded16074// value. Bitcast it to the expected result type.16075if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {16076SDValue &LdVal = NewResults[0];16077LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);16078}1607916080NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain16081DCI.CombineTo(N, NewResults);16082DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));1608316084return true;16085}1608616087// If (opcode ptr inc) is and ADD-like instruction, return the16088// increment value. 
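// For example, (add ptr, #16) and (or ptr, #16) both yield 16, the OR form
// only when the DAG can prove ptr and the increment have no common bits set
// (so the OR behaves as an ADD).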
// Otherwise return 0.
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
                                         SDValue Inc,
                                         const SelectionDAG &DAG) {
  ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
  if (!CInc)
    return 0;

  switch (Opcode) {
  case ARMISD::VLD1_UPD:
  case ISD::ADD:
    return CInc->getZExtValue();
  case ISD::OR: {
    if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
      // (OR ptr inc) is the same as (ADD ptr inc)
      return CInc->getZExtValue();
    }
    return 0;
  }
  default:
    return 0;
  }
}

static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::OR: {
    if (isa<ConstantSDNode>(N->getOperand(1))) {
      *Ptr = N->getOperand(0);
      *CInc = N->getOperand(1);
      return true;
    }
    return false;
  }
  case ARMISD::VLD1_UPD: {
    if (isa<ConstantSDNode>(N->getOperand(2))) {
      *Ptr = N->getOperand(1);
      *CInc = N->getOperand(2);
      return true;
    }
    return false;
  }
  default:
    return false;
  }
}

static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
  // Check that the add is independent of the load/store.
  // Otherwise, folding it would create a cycle. Search through Addr
  // as well, since the User may not be a direct user of Addr and
  // only share a base pointer.
  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(N);
  Worklist.push_back(User);
  if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
      SDNode::hasPredecessorHelper(User, Visited, Worklist))
    return false;
  return true;
}

/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
static SDValue CombineBaseUpdate(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};

  SDValue Addr = N->getOperand(AddrOpIdx);

  SmallVector<BaseUpdateUser, 8> BaseUpdates;

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (UI.getUse().getResNo() != Addr.getResNo() ||
        User->getNumOperands() != 2)
      continue;

    SDValue Inc = User->getOperand(UI.getOperandNo() == 1 ?
0 : 1);16176unsigned ConstInc =16177getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);1617816179if (ConstInc || User->getOpcode() == ISD::ADD)16180BaseUpdates.push_back({User, Inc, ConstInc});16181}1618216183// If the address is a constant pointer increment itself, find16184// another constant increment that has the same base operand16185SDValue Base;16186SDValue CInc;16187if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {16188unsigned Offset =16189getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);16190for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();16191UI != UE; ++UI) {1619216193SDNode *User = *UI;16194if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||16195User->getNumOperands() != 2)16196continue;1619716198SDValue UserInc = User->getOperand(UI.getOperandNo() == 0 ? 1 : 0);16199unsigned UserOffset =16200getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);1620116202if (!UserOffset || UserOffset <= Offset)16203continue;1620416205unsigned NewConstInc = UserOffset - Offset;16206SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);16207BaseUpdates.push_back({User, NewInc, NewConstInc});16208}16209}1621016211// Try to fold the load/store with an update that matches memory16212// access size. This should work well for sequential loads.16213//16214// Filter out invalid updates as well.16215unsigned NumValidUpd = BaseUpdates.size();16216for (unsigned I = 0; I < NumValidUpd;) {16217BaseUpdateUser &User = BaseUpdates[I];16218if (!isValidBaseUpdate(N, User.N)) {16219--NumValidUpd;16220std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);16221continue;16222}1622316224if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))16225return SDValue();16226++I;16227}16228BaseUpdates.resize(NumValidUpd);1622916230// Try to fold with other users. 
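  // (A non-constant increment is recorded with ConstInc == 0, so the
  // ascending ConstInc sort naturally produces the ordering described below.)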
Non-constant updates are considered16231// first, and constant updates are sorted to not break a sequence of16232// strided accesses (if there is any).16233std::stable_sort(BaseUpdates.begin(), BaseUpdates.end(),16234[](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {16235return LHS.ConstInc < RHS.ConstInc;16236});16237for (BaseUpdateUser &User : BaseUpdates) {16238if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))16239return SDValue();16240}16241return SDValue();16242}1624316244static SDValue PerformVLDCombine(SDNode *N,16245TargetLowering::DAGCombinerInfo &DCI) {16246if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())16247return SDValue();1624816249return CombineBaseUpdate(N, DCI);16250}1625116252static SDValue PerformMVEVLDCombine(SDNode *N,16253TargetLowering::DAGCombinerInfo &DCI) {16254if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())16255return SDValue();1625616257SelectionDAG &DAG = DCI.DAG;16258SDValue Addr = N->getOperand(2);16259MemSDNode *MemN = cast<MemSDNode>(N);16260SDLoc dl(N);1626116262// For the stores, where there are multiple intrinsics we only actually want16263// to post-inc the last of the them.16264unsigned IntNo = N->getConstantOperandVal(1);16265if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)16266return SDValue();16267if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)16268return SDValue();1626916270// Search for a use of the address operand that is an increment.16271for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),16272UE = Addr.getNode()->use_end();16273UI != UE; ++UI) {16274SDNode *User = *UI;16275if (User->getOpcode() != ISD::ADD ||16276UI.getUse().getResNo() != Addr.getResNo())16277continue;1627816279// Check that the add is independent of the load/store. Otherwise, folding16280// it would create a cycle. We can avoid searching through Addr as it's a16281// predecessor to both.16282SmallPtrSet<const SDNode *, 32> Visited;16283SmallVector<const SDNode *, 16> Worklist;16284Visited.insert(Addr.getNode());16285Worklist.push_back(N);16286Worklist.push_back(User);16287if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||16288SDNode::hasPredecessorHelper(User, Visited, Worklist))16289continue;1629016291// Find the new opcode for the updating load/store.16292bool isLoadOp = true;16293unsigned NewOpc = 0;16294unsigned NumVecs = 0;16295switch (IntNo) {16296default:16297llvm_unreachable("unexpected intrinsic for MVE VLDn combine");16298case Intrinsic::arm_mve_vld2q:16299NewOpc = ARMISD::VLD2_UPD;16300NumVecs = 2;16301break;16302case Intrinsic::arm_mve_vld4q:16303NewOpc = ARMISD::VLD4_UPD;16304NumVecs = 4;16305break;16306case Intrinsic::arm_mve_vst2q:16307NewOpc = ARMISD::VST2_UPD;16308NumVecs = 2;16309isLoadOp = false;16310break;16311case Intrinsic::arm_mve_vst4q:16312NewOpc = ARMISD::VST4_UPD;16313NumVecs = 4;16314isLoadOp = false;16315break;16316}1631716318// Find the size of memory referenced by the load/store.16319EVT VecTy;16320if (isLoadOp) {16321VecTy = N->getValueType(0);16322} else {16323VecTy = N->getOperand(3).getValueType();16324}1632516326unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;1632716328// If the increment is a constant, it must match the memory ref size.16329SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0);16330ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());16331if (!CInc || CInc->getZExtValue() != NumBytes)16332continue;1633316334// Create the new updating load/store node.16335// First, create an SDVTList for the new updating node's results.16336EVT Tys[6];16337unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);16338unsigned n;16339for (n = 0; n < NumResultVecs; ++n)16340Tys[n] = VecTy;16341Tys[n++] = MVT::i32;16342Tys[n] = MVT::Other;16343SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));1634416345// Then, gather the new node's operands.16346SmallVector<SDValue, 8> Ops;16347Ops.push_back(N->getOperand(0)); // incoming chain16348Ops.push_back(N->getOperand(2)); // ptr16349Ops.push_back(Inc);1635016351for (unsigned i = 3; i < N->getNumOperands(); ++i)16352Ops.push_back(N->getOperand(i));1635316354SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,16355MemN->getMemOperand());1635616357// Update the uses.16358SmallVector<SDValue, 5> NewResults;16359for (unsigned i = 0; i < NumResultVecs; ++i)16360NewResults.push_back(SDValue(UpdN.getNode(), i));1636116362NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain16363DCI.CombineTo(N, NewResults);16364DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));1636516366break;16367}1636816369return SDValue();16370}1637116372/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a16373/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic16374/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and16375/// return true.16376static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {16377SelectionDAG &DAG = DCI.DAG;16378EVT VT = N->getValueType(0);16379// vldN-dup instructions only support 64-bit vectors for N > 1.16380if (!VT.is64BitVector())16381return false;1638216383// Check if the VDUPLANE operand is a vldN-dup intrinsic.16384SDNode *VLD = N->getOperand(0).getNode();16385if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)16386return false;16387unsigned NumVecs = 0;16388unsigned NewOpc = 0;16389unsigned IntNo = VLD->getConstantOperandVal(1);16390if (IntNo == Intrinsic::arm_neon_vld2lane) {16391NumVecs = 2;16392NewOpc = ARMISD::VLD2DUP;16393} else if (IntNo == Intrinsic::arm_neon_vld3lane) {16394NumVecs = 3;16395NewOpc = ARMISD::VLD3DUP;16396} else if (IntNo == Intrinsic::arm_neon_vld4lane) {16397NumVecs = 4;16398NewOpc = ARMISD::VLD4DUP;16399} else {16400return false;16401}1640216403// First check that all the vldN-lane uses are VDUPLANEs and that the lane16404// numbers match the load.16405unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);16406for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();16407UI != UE; ++UI) {16408// Ignore uses of the chain result.16409if (UI.getUse().getResNo() == NumVecs)16410continue;16411SDNode *User = *UI;16412if (User->getOpcode() != ARMISD::VDUPLANE ||16413VLDLaneNo != User->getConstantOperandVal(1))16414return false;16415}1641616417// Create the vldN-dup node.16418EVT Tys[5];16419unsigned n;16420for (n = 0; n < NumVecs; ++n)16421Tys[n] = VT;16422Tys[n] = MVT::Other;16423SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));16424SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };16425MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);16426SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,16427Ops, VLDMemInt->getMemoryVT(),16428VLDMemInt->getMemOperand());1642916430// Update the uses.16431for 
(SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();16432UI != UE; ++UI) {16433unsigned ResNo = UI.getUse().getResNo();16434// Ignore uses of the chain result.16435if (ResNo == NumVecs)16436continue;16437SDNode *User = *UI;16438DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));16439}1644016441// Now the vldN-lane intrinsic is dead except for its chain result.16442// Update uses of the chain.16443std::vector<SDValue> VLDDupResults;16444for (unsigned n = 0; n < NumVecs; ++n)16445VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));16446VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));16447DCI.CombineTo(VLD, VLDDupResults);1644816449return true;16450}1645116452/// PerformVDUPLANECombine - Target-specific dag combine xforms for16453/// ARMISD::VDUPLANE.16454static SDValue PerformVDUPLANECombine(SDNode *N,16455TargetLowering::DAGCombinerInfo &DCI,16456const ARMSubtarget *Subtarget) {16457SDValue Op = N->getOperand(0);16458EVT VT = N->getValueType(0);1645916460// On MVE, we just convert the VDUPLANE to a VDUP with an extract.16461if (Subtarget->hasMVEIntegerOps()) {16462EVT ExtractVT = VT.getVectorElementType();16463// We need to ensure we are creating a legal type.16464if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))16465ExtractVT = MVT::i32;16466SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,16467N->getOperand(0), N->getOperand(1));16468return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);16469}1647016471// If the source is a vldN-lane (N > 1) intrinsic, and all the other uses16472// of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.16473if (CombineVLDDUP(N, DCI))16474return SDValue(N, 0);1647516476// If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is16477// redundant. 
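  // For example, vduplane(vmovimm(#imm), lane) already produces the same
  // value in every lane, so it can be replaced by a bitcast of the splat
  // itself once the element-size check below succeeds.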
  // Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  unsigned Imm = Op.getConstantOperandVal(0);
  unsigned EltBits;
  if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
    EltSize = 8;
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  if (Subtarget->hasMVEIntegerOps()) {
    // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
    // need to come from a GPR.
    if (Op.getValueType() == MVT::f32)
      return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                         DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
    else if (Op.getValueType() == MVT::f16)
      return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                         DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
  }

  if (!Subtarget->hasNEON())
    return SDValue();

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
                     DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup =
        DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
                                LD->getMemoryVT(), LD->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}

static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

// Optimize trunc store (of multiple scalars) to shuffle and store. First,
// pack all of the elements in one place.
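// (For example, a v4i32 -> v4i16 truncating store becomes a v8i16 shuffle
// that moves the four truncated halves into the low lanes.)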
Next, store to memory in fewer16550// chunks.16551static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,16552SelectionDAG &DAG) {16553SDValue StVal = St->getValue();16554EVT VT = StVal.getValueType();16555if (!St->isTruncatingStore() || !VT.isVector())16556return SDValue();16557const TargetLowering &TLI = DAG.getTargetLoweringInfo();16558EVT StVT = St->getMemoryVT();16559unsigned NumElems = VT.getVectorNumElements();16560assert(StVT != VT && "Cannot truncate to the same type");16561unsigned FromEltSz = VT.getScalarSizeInBits();16562unsigned ToEltSz = StVT.getScalarSizeInBits();1656316564// From, To sizes and ElemCount must be pow of two16565if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))16566return SDValue();1656716568// We are going to use the original vector elt for storing.16569// Accumulated smaller vector elements must be a multiple of the store size.16570if (0 != (NumElems * FromEltSz) % ToEltSz)16571return SDValue();1657216573unsigned SizeRatio = FromEltSz / ToEltSz;16574assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());1657516576// Create a type on which we perform the shuffle.16577EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),16578NumElems * SizeRatio);16579assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());1658016581SDLoc DL(St);16582SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);16583SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);16584for (unsigned i = 0; i < NumElems; ++i)16585ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 116586: i * SizeRatio;1658716588// Can't shuffle using an illegal type.16589if (!TLI.isTypeLegal(WideVecVT))16590return SDValue();1659116592SDValue Shuff = DAG.getVectorShuffle(16593WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);16594// At this point all of the data is stored at the bottom of the16595// register. 
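  // The widest legal integer type that still fits the packed payload is used
  // as the store unit; e.g. with a 64-bit payload and only i32 legal as a
  // scalar type, the data is written back as two i32 stores.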
We now need to save it to mem.1659616597// Find the largest store unit16598MVT StoreType = MVT::i8;16599for (MVT Tp : MVT::integer_valuetypes()) {16600if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)16601StoreType = Tp;16602}16603// Didn't find a legal store type.16604if (!TLI.isTypeLegal(StoreType))16605return SDValue();1660616607// Bitcast the original vector into a vector of store-size units16608EVT StoreVecVT =16609EVT::getVectorVT(*DAG.getContext(), StoreType,16610VT.getSizeInBits() / EVT(StoreType).getSizeInBits());16611assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());16612SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);16613SmallVector<SDValue, 8> Chains;16614SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,16615TLI.getPointerTy(DAG.getDataLayout()));16616SDValue BasePtr = St->getBasePtr();1661716618// Perform one or more big stores into memory.16619unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();16620for (unsigned I = 0; I < E; I++) {16621SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,16622ShuffWide, DAG.getIntPtrConstant(I, DL));16623SDValue Ch =16624DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),16625St->getAlign(), St->getMemOperand()->getFlags());16626BasePtr =16627DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);16628Chains.push_back(Ch);16629}16630return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);16631}1663216633// Try taking a single vector store from an fpround (which would otherwise turn16634// into an expensive buildvector) and splitting it into a series of narrowing16635// stores.16636static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,16637SelectionDAG &DAG) {16638if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())16639return SDValue();16640SDValue Trunc = St->getValue();16641if (Trunc->getOpcode() != ISD::FP_ROUND)16642return SDValue();16643EVT FromVT = Trunc->getOperand(0).getValueType();16644EVT ToVT = Trunc.getValueType();16645if (!ToVT.isVector())16646return SDValue();16647assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());16648EVT ToEltVT = ToVT.getVectorElementType();16649EVT FromEltVT = FromVT.getVectorElementType();1665016651if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)16652return SDValue();1665316654unsigned NumElements = 4;16655if (FromVT.getVectorNumElements() % NumElements != 0)16656return SDValue();1665716658// Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so16659// use the VMOVN over splitting the store. We are looking for patterns of:16660// !rev: 0 N 1 N+1 2 N+2 ...16661// rev: N 0 N+1 1 N+2 2 ...16662// The shuffle may either be a single source (in which case N = NumElts/2) or16663// two inputs extended with concat to the same size (in which case N =16664// NumElts).16665auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {16666ArrayRef<int> M = SVN->getMask();16667unsigned NumElts = ToVT.getVectorNumElements();16668if (SVN->getOperand(1).isUndef())16669NumElts /= 2;1667016671unsigned Off0 = Rev ? NumElts : 0;16672unsigned Off1 = Rev ? 
0 : NumElts;1667316674for (unsigned I = 0; I < NumElts; I += 2) {16675if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))16676return false;16677if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))16678return false;16679}1668016681return true;16682};1668316684if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))16685if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))16686return SDValue();1668716688LLVMContext &C = *DAG.getContext();16689SDLoc DL(St);16690// Details about the old store16691SDValue Ch = St->getChain();16692SDValue BasePtr = St->getBasePtr();16693Align Alignment = St->getOriginalAlign();16694MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();16695AAMDNodes AAInfo = St->getAAInfo();1669616697// We split the store into slices of NumElements. fp16 trunc stores are vcvt16698// and then stored as truncating integer stores.16699EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);16700EVT NewToVT = EVT::getVectorVT(16701C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);1670216703SmallVector<SDValue, 4> Stores;16704for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {16705unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;16706SDValue NewPtr =16707DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));1670816709SDValue Extract =16710DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),16711DAG.getConstant(i * NumElements, DL, MVT::i32));1671216713SDValue FPTrunc =16714DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),16715Extract, DAG.getConstant(0, DL, MVT::i32));16716Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);1671716718SDValue Store = DAG.getTruncStore(16719Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),16720NewToVT, Alignment, MMOFlags, AAInfo);16721Stores.push_back(Store);16722}16723return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);16724}1672516726// Try taking a single vector store from an MVETRUNC (which would otherwise turn16727// into an expensive buildvector) and splitting it into a series of narrowing16728// stores.16729static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,16730SelectionDAG &DAG) {16731if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())16732return SDValue();16733SDValue Trunc = St->getValue();16734if (Trunc->getOpcode() != ARMISD::MVETRUNC)16735return SDValue();16736EVT FromVT = Trunc->getOperand(0).getValueType();16737EVT ToVT = Trunc.getValueType();1673816739LLVMContext &C = *DAG.getContext();16740SDLoc DL(St);16741// Details about the old store16742SDValue Ch = St->getChain();16743SDValue BasePtr = St->getBasePtr();16744Align Alignment = St->getOriginalAlign();16745MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();16746AAMDNodes AAInfo = St->getAAInfo();1674716748EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),16749FromVT.getVectorNumElements());1675016751SmallVector<SDValue, 4> Stores;16752for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {16753unsigned NewOffset =16754i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;16755SDValue NewPtr =16756DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));1675716758SDValue Extract = Trunc.getOperand(i);16759SDValue Store = DAG.getTruncStore(16760Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),16761NewToVT, Alignment, MMOFlags, 
AAInfo);16762Stores.push_back(Store);16763}16764return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);16765}1676616767// Given a floating point store from an extracted vector, with an integer16768// VGETLANE that already exists, store the existing VGETLANEu directly. This can16769// help reduce fp register pressure, doesn't require the fp extract and allows16770// use of more integer post-inc stores not available with vstr.16771static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {16772if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())16773return SDValue();16774SDValue Extract = St->getValue();16775EVT VT = Extract.getValueType();16776// For now only uses f16. This may be useful for f32 too, but that will16777// be bitcast(extract), not the VGETLANEu we currently check here.16778if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)16779return SDValue();1678016781SDNode *GetLane =16782DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),16783{Extract.getOperand(0), Extract.getOperand(1)});16784if (!GetLane)16785return SDValue();1678616787LLVMContext &C = *DAG.getContext();16788SDLoc DL(St);16789// Create a new integer store to replace the existing floating point version.16790SDValue Ch = St->getChain();16791SDValue BasePtr = St->getBasePtr();16792Align Alignment = St->getOriginalAlign();16793MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();16794AAMDNodes AAInfo = St->getAAInfo();16795EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());16796SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,16797St->getPointerInfo(), NewToVT, Alignment,16798MMOFlags, AAInfo);1679916800return Store;16801}1680216803/// PerformSTORECombine - Target-specific dag combine xforms for16804/// ISD::STORE.16805static SDValue PerformSTORECombine(SDNode *N,16806TargetLowering::DAGCombinerInfo &DCI,16807const ARMSubtarget *Subtarget) {16808StoreSDNode *St = cast<StoreSDNode>(N);16809if (St->isVolatile())16810return SDValue();16811SDValue StVal = St->getValue();16812EVT VT = StVal.getValueType();1681316814if (Subtarget->hasNEON())16815if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))16816return Store;1681716818if (Subtarget->hasMVEFloatOps())16819if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))16820return NewToken;1682116822if (Subtarget->hasMVEIntegerOps()) {16823if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))16824return NewChain;16825if (SDValue NewToken =16826PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))16827return NewToken;16828}1682916830if (!ISD::isNormalStore(St))16831return SDValue();1683216833// Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and16834// ARM stores of arguments in the same cache line.16835if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&16836StVal.getNode()->hasOneUse()) {16837SelectionDAG &DAG = DCI.DAG;16838bool isBigEndian = DAG.getDataLayout().isBigEndian();16839SDLoc DL(St);16840SDValue BasePtr = St->getBasePtr();16841SDValue NewST1 = DAG.getStore(16842St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),16843BasePtr, St->getPointerInfo(), St->getOriginalAlign(),16844St->getMemOperand()->getFlags());1684516846SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,16847DAG.getConstant(4, DL, MVT::i32));16848return DAG.getStore(NewST1.getValue(0), DL,16849StVal.getNode()->getOperand(isBigEndian ? 
0 : 1),16850OffsetPtr, St->getPointerInfo().getWithOffset(4),16851St->getOriginalAlign(),16852St->getMemOperand()->getFlags());16853}1685416855if (StVal.getValueType() == MVT::i64 &&16856StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {1685716858// Bitcast an i64 store extracted from a vector to f64.16859// Otherwise, the i64 value will be legalized to a pair of i32 values.16860SelectionDAG &DAG = DCI.DAG;16861SDLoc dl(StVal);16862SDValue IntVec = StVal.getOperand(0);16863EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,16864IntVec.getValueType().getVectorNumElements());16865SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);16866SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,16867Vec, StVal.getOperand(1));16868dl = SDLoc(N);16869SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);16870// Make the DAGCombiner fold the bitcasts.16871DCI.AddToWorklist(Vec.getNode());16872DCI.AddToWorklist(ExtElt.getNode());16873DCI.AddToWorklist(V.getNode());16874return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),16875St->getPointerInfo(), St->getAlign(),16876St->getMemOperand()->getFlags(), St->getAAInfo());16877}1687816879// If this is a legal vector store, try to combine it into a VST1_UPD.16880if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&16881DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))16882return CombineBaseUpdate(N, DCI);1688316884return SDValue();16885}1688616887/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)16888/// can replace combinations of VMUL and VCVT (floating-point to integer)16889/// when the VMUL has a constant operand that is a power of 2.16890///16891/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):16892/// vmul.f32 d16, d17, d1616893/// vcvt.s32.f32 d16, d1616894/// becomes:16895/// vcvt.s32.f32 d16, d16, #316896static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,16897const ARMSubtarget *Subtarget) {16898if (!Subtarget->hasNEON())16899return SDValue();1690016901SDValue Op = N->getOperand(0);16902if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||16903Op.getOpcode() != ISD::FMUL)16904return SDValue();1690516906SDValue ConstVec = Op->getOperand(1);16907if (!isa<BuildVectorSDNode>(ConstVec))16908return SDValue();1690916910MVT FloatTy = Op.getSimpleValueType().getVectorElementType();16911uint32_t FloatBits = FloatTy.getSizeInBits();16912MVT IntTy = N->getSimpleValueType(0).getVectorElementType();16913uint32_t IntBits = IntTy.getSizeInBits();16914unsigned NumLanes = Op.getValueType().getVectorNumElements();16915if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {16916// These instructions only exist converting from f32 to i32. We can handle16917// smaller integers by generating an extra truncate, but larger ones would16918// be lossy. We also can't handle anything other than 2 or 4 lanes, since16919// these intructions only support v2i32/v4i32 types.16920return SDValue();16921}1692216923BitVector UndefElements;16924BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);16925int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);16926if (C == -1 || C == 0 || C > 32)16927return SDValue();1692816929SDLoc dl(N);16930bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;16931unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :16932Intrinsic::arm_neon_vcvtfp2fxu;16933SDValue FixConv = DAG.getNode(16934ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32,16935DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),16936DAG.getConstant(C, dl, MVT::i32));1693716938if (IntBits < FloatBits)16939FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);1694016941return FixConv;16942}1694316944static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,16945const ARMSubtarget *Subtarget) {16946if (!Subtarget->hasMVEFloatOps())16947return SDValue();1694816949// Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)16950// The second form can be more easily turned into a predicated vadd, and16951// possibly combined into a fma to become a predicated vfma.16952SDValue Op0 = N->getOperand(0);16953SDValue Op1 = N->getOperand(1);16954EVT VT = N->getValueType(0);16955SDLoc DL(N);1695616957// The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,16958// which these VMOV's represent.16959auto isIdentitySplat = [&](SDValue Op, bool NSZ) {16960if (Op.getOpcode() != ISD::BITCAST ||16961Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)16962return false;16963uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);16964if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))16965return true;16966if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))16967return true;16968return false;16969};1697016971if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)16972std::swap(Op0, Op1);1697316974if (Op1.getOpcode() != ISD::VSELECT)16975return SDValue();1697616977SDNodeFlags FaddFlags = N->getFlags();16978bool NSZ = FaddFlags.hasNoSignedZeros();16979if (!isIdentitySplat(Op1.getOperand(2), NSZ))16980return SDValue();1698116982SDValue FAdd =16983DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);16984return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);16985}1698616987static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {16988SDValue LHS = N->getOperand(0);16989SDValue RHS = N->getOperand(1);16990EVT VT = N->getValueType(0);16991SDLoc DL(N);1699216993if (!N->getFlags().hasAllowReassociation())16994return SDValue();1699516996// Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)16997auto ReassocComplex = [&](SDValue A, SDValue B) {16998if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)16999return SDValue();17000unsigned Opc = A.getConstantOperandVal(0);17001if (Opc != Intrinsic::arm_mve_vcmlaq)17002return SDValue();17003SDValue VCMLA = DAG.getNode(17004ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),17005DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),17006A.getOperand(3), A.getOperand(4));17007VCMLA->setFlags(A->getFlags());17008return VCMLA;17009};17010if (SDValue R = ReassocComplex(LHS, RHS))17011return R;17012if (SDValue R = ReassocComplex(RHS, LHS))17013return R;1701417015return SDValue();17016}1701717018static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,17019const ARMSubtarget *Subtarget) {17020if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))17021return S;17022if (SDValue S = PerformFADDVCMLACombine(N, DAG))17023return S;17024return SDValue();17025}1702617027/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)17028/// can replace combinations of VCVT (integer to floating-point) and VMUL17029/// when the VMUL has a constant operand that is a power of 2.17030///17031/// Example (assume d17 = <float 0.125, float 0.125>):17032/// vcvt.f32.s32 d16, d1617033/// vmul.f32 d16, d16, d1717034/// 
becomes:
/// vcvt.f32.s32 d16, d16, #3
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned OpOpcode = Op.getNode()->getOpcode();
  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
    // These instructions only exist converting from i32 to f32. We can handle
    // smaller integers by generating an extra extend, but larger ones would
    // be lossy. We also can't handle anything other than 2 or 4 lanes, since
    // these instructions only support v2i32/v4i32 types.
    return SDValue();
  }

  ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
  APFloat Recip(0.0f);
  if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
    return SDValue();

  bool IsExact;
  APSInt IntVal(33);
  if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
          APFloat::opOK ||
      !IsExact)
    return SDValue();

  int32_t C = IntVal.exactLogBase2();
  if (C == -1 || C == 0 || C > 32)
    return SDValue();
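
  // Worked example of the reciprocal check above, using the constants from
  // the function comment: a VMUL by 0.125 has the exact reciprocal 8.0, which
  // converts to the integer 8 == 2^3, so C == 3 and the combine emits a
  // fixed-point vcvt.f32.s32 with #3. A multiplier such as 3.0 is rejected
  // because 1/3 has no exact floating-point representation.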
  SDLoc DL(N);
  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
  SDValue ConvInput = Op.getOperand(0);
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                            NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);

  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
                                      : Intrinsic::arm_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                     DAG.getConstant(C, DL, MVT::i32));
}

static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
                                           const ARMSubtarget *ST) {
  if (!ST->hasMVEIntegerOps())
    return SDValue();

  assert(N->getOpcode() == ISD::VECREDUCE_ADD);
  EVT ResVT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDLoc dl(N);

  // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
  if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
      (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
       N0.getValueType() == MVT::v16i8)) {
    SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
    SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
    return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
  }

  // We are looking for something that will have illegal types if left alone,
  // but that we can convert to a single instruction under MVE. For example
  // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
  // or
  // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B

  // The legal cases are:
  // VADDV u/s 8/16/32
  // VMLAV u/s 8/16/32
  // VADDLV u/s 32
  // VMLALV u/s 16/32

  // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
  // extend it and use v4i32 instead.
  auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
    EVT AVT = A.getValueType();
    return any_of(ExtTypes, [&](MVT Ty) {
      return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
             AVT.bitsLE(Ty);
    });
  };
  auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
    EVT AVT = A.getValueType();
    if (!AVT.is128BitVector())
      A = DAG.getNode(ExtendCode, dl,
                      AVT.changeVectorElementType(MVT::getIntegerVT(
                          128 / AVT.getVectorMinNumElements())),
                      A);
    return A;
  };
  auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
    if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = N0->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
                         ArrayRef<MVT> ExtTypes, SDValue &Mask) {
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return SDValue();
    Mask = N0->getOperand(0);
    SDValue Ext = N0->getOperand(1);
    if (Ext->getOpcode() != ExtendCode)
      return SDValue();
    SDValue A = Ext->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes))
      return ExtendIfNeeded(A, ExtendCode);
    return SDValue();
  };
  auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                     SDValue &A, SDValue &B) {
    // For a vmla we are trying to match a larger pattern:
    // ExtA = sext/zext A
    // ExtB = sext/zext B
    // Mul = mul ExtA, ExtB
    // vecreduce.add Mul
    // There might also be an extra extend between the mul and the addreduce,
    // so long as the bitwidth is high enough to make them equivalent (for
    // example the original v8i16 might be mul at v8i32 and the reduce happens
    // at v8i64).
    if (ResVT != RetTy)
      return false;
    SDValue Mul = N0;
    if (Mul->getOpcode() == ExtendCode &&
        Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
            ResVT.getScalarSizeInBits())
      Mul = Mul->getOperand(0);
    if (Mul->getOpcode() != ISD::MUL)
      return false;
    SDValue ExtA = Mul->getOperand(0);
    SDValue ExtB = Mul->getOperand(1);
    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
      return false;
    A = ExtA->getOperand(0);
    B = ExtB->getOperand(0);
    if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
      A = ExtendIfNeeded(A, ExtendCode);
      B = ExtendIfNeeded(B, ExtendCode);
      return true;
    }
    return false;
  };
  auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
                         SDValue &A, SDValue &B, SDValue &Mask) {
    // Same as the pattern above with a select for the zero predicated lanes
    // ExtA = sext/zext A
    // ExtB = sext/zext B
    // Mul = mul ExtA, ExtB
    // N0 = select Mask, Mul, 0
    // vecreduce.add N0
    if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
        !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
      return false;
    Mask = N0->getOperand(0);
    SDValue Mul = N0->getOperand(1);
    if
(Mul->getOpcode() == ExtendCode &&17211Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=17212ResVT.getScalarSizeInBits())17213Mul = Mul->getOperand(0);17214if (Mul->getOpcode() != ISD::MUL)17215return false;17216SDValue ExtA = Mul->getOperand(0);17217SDValue ExtB = Mul->getOperand(1);17218if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)17219return false;17220A = ExtA->getOperand(0);17221B = ExtB->getOperand(0);17222if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {17223A = ExtendIfNeeded(A, ExtendCode);17224B = ExtendIfNeeded(B, ExtendCode);17225return true;17226}17227return false;17228};17229auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {17230// Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i6417231// reductions. The operands are extended with MVEEXT, but as they are17232// reductions the lane orders do not matter. MVEEXT may be combined with17233// loads to produce two extending loads, or else they will be expanded to17234// VREV/VMOVL.17235EVT VT = Ops[0].getValueType();17236if (VT == MVT::v16i8) {17237assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&17238"Unexpected illegal long reduction opcode");17239bool IsUnsigned = Opcode == ARMISD::VMLALVu;1724017241SDValue Ext0 =17242DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,17243DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);17244SDValue Ext1 =17245DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,17246DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);1724717248SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),17249Ext0, Ext1);17250SDValue MLA1 =17251DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,17252DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),17253Ext0.getValue(1), Ext1.getValue(1));17254return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));17255}17256SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);17257return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,17258SDValue(Node.getNode(), 1));17259};1726017261SDValue A, B;17262SDValue Mask;17263if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))17264return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);17265if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))17266return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);17267if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},17268A, B))17269return Create64bitNode(ARMISD::VMLALVs, {A, B});17270if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},17271A, B))17272return Create64bitNode(ARMISD::VMLALVu, {A, B});17273if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))17274return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17275DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));17276if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))17277return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17278DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));1727917280if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,17281Mask))17282return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);17283if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,17284Mask))17285return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);17286if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,17287Mask))17288return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});17289if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, 
{MVT::v8i16, MVT::v4i32}, A, B,17290Mask))17291return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});17292if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))17293return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17294DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));17295if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))17296return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17297DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));1729817299if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))17300return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);17301if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))17302return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);17303if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))17304return Create64bitNode(ARMISD::VADDLVs, {A});17305if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))17306return Create64bitNode(ARMISD::VADDLVu, {A});17307if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))17308return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17309DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));17310if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))17311return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17312DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));1731317314if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))17315return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);17316if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))17317return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);17318if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))17319return Create64bitNode(ARMISD::VADDLVps, {A, Mask});17320if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))17321return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});17322if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))17323return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17324DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));17325if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))17326return DAG.getNode(ISD::TRUNCATE, dl, ResVT,17327DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));1732817329// Some complications. We can get a case where the two inputs of the mul are17330// the same, then the output sext will have been helpfully converted to a17331// zext. Turn it back.17332SDValue Op = N0;17333if (Op->getOpcode() == ISD::VSELECT)17334Op = Op->getOperand(1);17335if (Op->getOpcode() == ISD::ZERO_EXTEND &&17336Op->getOperand(0)->getOpcode() == ISD::MUL) {17337SDValue Mul = Op->getOperand(0);17338if (Mul->getOperand(0) == Mul->getOperand(1) &&17339Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {17340SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);17341if (Op != N0)17342Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),17343N0->getOperand(0), Ext, N0->getOperand(2));17344return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);17345}17346}1734717348return SDValue();17349}1735017351// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all17352// the lanes are used. Due to the reduction being commutative the shuffle can be17353// removed.17354static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {17355unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 
0 : 2;17356auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));17357if (!Shuf || !Shuf->getOperand(1).isUndef())17358return SDValue();1735917360// Check all elements are used once in the mask.17361ArrayRef<int> Mask = Shuf->getMask();17362APInt SetElts(Mask.size(), 0);17363for (int E : Mask) {17364if (E < 0 || E >= (int)Mask.size())17365return SDValue();17366SetElts.setBit(E);17367}17368if (!SetElts.isAllOnes())17369return SDValue();1737017371if (N->getNumOperands() != VecOp + 1) {17372auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));17373if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)17374return SDValue();17375}1737617377SmallVector<SDValue> Ops;17378for (SDValue Op : N->ops()) {17379if (Op.getValueType().isVector())17380Ops.push_back(Op.getOperand(0));17381else17382Ops.push_back(Op);17383}17384return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);17385}1738617387static SDValue PerformVMOVNCombine(SDNode *N,17388TargetLowering::DAGCombinerInfo &DCI) {17389SDValue Op0 = N->getOperand(0);17390SDValue Op1 = N->getOperand(1);17391unsigned IsTop = N->getConstantOperandVal(2);1739217393// VMOVNT a undef -> a17394// VMOVNB a undef -> a17395// VMOVNB undef a -> a17396if (Op1->isUndef())17397return Op0;17398if (Op0->isUndef() && !IsTop)17399return Op1;1740017401// VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)17402// VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)17403if ((Op1->getOpcode() == ARMISD::VQMOVNs ||17404Op1->getOpcode() == ARMISD::VQMOVNu) &&17405Op1->getConstantOperandVal(2) == 0)17406return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),17407Op0, Op1->getOperand(1), N->getOperand(2));1740817409// Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from17410// Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting17411// into the top or bottom lanes.17412unsigned NumElts = N->getValueType(0).getVectorNumElements();17413APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));17414APInt Op0DemandedElts =17415IsTop ? Op1DemandedElts17416: APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));1741717418const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();17419if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))17420return SDValue(N, 0);17421if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))17422return SDValue(N, 0);1742317424return SDValue();17425}1742617427static SDValue PerformVQMOVNCombine(SDNode *N,17428TargetLowering::DAGCombinerInfo &DCI) {17429SDValue Op0 = N->getOperand(0);17430unsigned IsTop = N->getConstantOperandVal(2);1743117432unsigned NumElts = N->getValueType(0).getVectorNumElements();17433APInt Op0DemandedElts =17434APInt::getSplat(NumElts, IsTop ? 
APInt::getLowBitsSet(2, 1)17435: APInt::getHighBitsSet(2, 1));1743617437const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();17438if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))17439return SDValue(N, 0);17440return SDValue();17441}1744217443static SDValue PerformVQDMULHCombine(SDNode *N,17444TargetLowering::DAGCombinerInfo &DCI) {17445EVT VT = N->getValueType(0);17446SDValue LHS = N->getOperand(0);17447SDValue RHS = N->getOperand(1);1744817449auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);17450auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);17451// Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)17452if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&17453LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&17454(LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {17455SDLoc DL(N);17456SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,17457LHS.getOperand(0), RHS.getOperand(0));17458SDValue UndefV = LHS.getOperand(1);17459return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());17460}17461return SDValue();17462}1746317464static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {17465SDLoc DL(N);17466SDValue Op0 = N->getOperand(0);17467SDValue Op1 = N->getOperand(1);1746817469// Turn X << -C -> X >> C and viceversa. The negative shifts can come up from17470// uses of the intrinsics.17471if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {17472int ShiftAmt = C->getSExtValue();17473if (ShiftAmt == 0) {17474SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);17475DAG.ReplaceAllUsesWith(N, Merge.getNode());17476return SDValue();17477}1747817479if (ShiftAmt >= -32 && ShiftAmt < 0) {17480unsigned NewOpcode =17481N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;17482SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,17483DAG.getConstant(-ShiftAmt, DL, MVT::i32));17484DAG.ReplaceAllUsesWith(N, NewShift.getNode());17485return NewShift;17486}17487}1748817489return SDValue();17490}1749117492/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.17493SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,17494DAGCombinerInfo &DCI) const {17495SelectionDAG &DAG = DCI.DAG;17496unsigned IntNo = N->getConstantOperandVal(0);17497switch (IntNo) {17498default:17499// Don't do anything for most intrinsics.17500break;1750117502// Vector shifts: check for immediate versions and lower them.17503// Note: This is done during DAG combining instead of DAG legalizing because17504// the build_vectors for 64-bit vector element shift counts are generally17505// not legal, and it is hard to see their values after they get legalized to17506// loads from a constant pool.17507case Intrinsic::arm_neon_vshifts:17508case Intrinsic::arm_neon_vshiftu:17509case Intrinsic::arm_neon_vrshifts:17510case Intrinsic::arm_neon_vrshiftu:17511case Intrinsic::arm_neon_vrshiftn:17512case Intrinsic::arm_neon_vqshifts:17513case Intrinsic::arm_neon_vqshiftu:17514case Intrinsic::arm_neon_vqshiftsu:17515case Intrinsic::arm_neon_vqshiftns:17516case Intrinsic::arm_neon_vqshiftnu:17517case Intrinsic::arm_neon_vqshiftnsu:17518case Intrinsic::arm_neon_vqrshiftns:17519case Intrinsic::arm_neon_vqrshiftnu:17520case Intrinsic::arm_neon_vqrshiftnsu: {17521EVT VT = N->getOperand(1).getValueType();17522int64_t Cnt;17523unsigned VShiftOpc = 0;1752417525switch (IntNo) {17526case Intrinsic::arm_neon_vshifts:17527case Intrinsic::arm_neon_vshiftu:17528if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {17529VShiftOpc 
= ARMISD::VSHLIMM;17530break;17531}17532if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {17533VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM17534: ARMISD::VSHRuIMM);17535break;17536}17537return SDValue();1753817539case Intrinsic::arm_neon_vrshifts:17540case Intrinsic::arm_neon_vrshiftu:17541if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))17542break;17543return SDValue();1754417545case Intrinsic::arm_neon_vqshifts:17546case Intrinsic::arm_neon_vqshiftu:17547if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))17548break;17549return SDValue();1755017551case Intrinsic::arm_neon_vqshiftsu:17552if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))17553break;17554llvm_unreachable("invalid shift count for vqshlu intrinsic");1755517556case Intrinsic::arm_neon_vrshiftn:17557case Intrinsic::arm_neon_vqshiftns:17558case Intrinsic::arm_neon_vqshiftnu:17559case Intrinsic::arm_neon_vqshiftnsu:17560case Intrinsic::arm_neon_vqrshiftns:17561case Intrinsic::arm_neon_vqrshiftnu:17562case Intrinsic::arm_neon_vqrshiftnsu:17563// Narrowing shifts require an immediate right shift.17564if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))17565break;17566llvm_unreachable("invalid shift count for narrowing vector shift "17567"intrinsic");1756817569default:17570llvm_unreachable("unhandled vector shift");17571}1757217573switch (IntNo) {17574case Intrinsic::arm_neon_vshifts:17575case Intrinsic::arm_neon_vshiftu:17576// Opcode already set above.17577break;17578case Intrinsic::arm_neon_vrshifts:17579VShiftOpc = ARMISD::VRSHRsIMM;17580break;17581case Intrinsic::arm_neon_vrshiftu:17582VShiftOpc = ARMISD::VRSHRuIMM;17583break;17584case Intrinsic::arm_neon_vrshiftn:17585VShiftOpc = ARMISD::VRSHRNIMM;17586break;17587case Intrinsic::arm_neon_vqshifts:17588VShiftOpc = ARMISD::VQSHLsIMM;17589break;17590case Intrinsic::arm_neon_vqshiftu:17591VShiftOpc = ARMISD::VQSHLuIMM;17592break;17593case Intrinsic::arm_neon_vqshiftsu:17594VShiftOpc = ARMISD::VQSHLsuIMM;17595break;17596case Intrinsic::arm_neon_vqshiftns:17597VShiftOpc = ARMISD::VQSHRNsIMM;17598break;17599case Intrinsic::arm_neon_vqshiftnu:17600VShiftOpc = ARMISD::VQSHRNuIMM;17601break;17602case Intrinsic::arm_neon_vqshiftnsu:17603VShiftOpc = ARMISD::VQSHRNsuIMM;17604break;17605case Intrinsic::arm_neon_vqrshiftns:17606VShiftOpc = ARMISD::VQRSHRNsIMM;17607break;17608case Intrinsic::arm_neon_vqrshiftnu:17609VShiftOpc = ARMISD::VQRSHRNuIMM;17610break;17611case Intrinsic::arm_neon_vqrshiftnsu:17612VShiftOpc = ARMISD::VQRSHRNsuIMM;17613break;17614}1761517616SDLoc dl(N);17617return DAG.getNode(VShiftOpc, dl, N->getValueType(0),17618N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));17619}1762017621case Intrinsic::arm_neon_vshiftins: {17622EVT VT = N->getOperand(1).getValueType();17623int64_t Cnt;17624unsigned VShiftOpc = 0;1762517626if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))17627VShiftOpc = ARMISD::VSLIIMM;17628else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))17629VShiftOpc = ARMISD::VSRIIMM;17630else {17631llvm_unreachable("invalid shift count for vsli/vsri intrinsic");17632}1763317634SDLoc dl(N);17635return DAG.getNode(VShiftOpc, dl, N->getValueType(0),17636N->getOperand(1), N->getOperand(2),17637DAG.getConstant(Cnt, dl, MVT::i32));17638}1763917640case Intrinsic::arm_neon_vqrshifts:17641case Intrinsic::arm_neon_vqrshiftu:17642// No immediate versions of these to check for.17643break;1764417645case Intrinsic::arm_mve_vqdmlah:17646case Intrinsic::arm_mve_vqdmlash:17647case Intrinsic::arm_mve_vqrdmlah:17648case 
Intrinsic::arm_mve_vqrdmlash:
  case Intrinsic::arm_mve_vmla_n_predicated:
  case Intrinsic::arm_mve_vmlas_n_predicated:
  case Intrinsic::arm_mve_vqdmlah_predicated:
  case Intrinsic::arm_mve_vqdmlash_predicated:
  case Intrinsic::arm_mve_vqrdmlah_predicated:
  case Intrinsic::arm_mve_vqrdmlash_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they return. So we don't need
    // any bits of that operand above that point, which allows us to eliminate
    // uxth/sxth.
    unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
      return SDValue();
    break;
  }

  case Intrinsic::arm_mve_minv:
  case Intrinsic::arm_mve_maxv:
  case Intrinsic::arm_mve_minav:
  case Intrinsic::arm_mve_maxav:
  case Intrinsic::arm_mve_minv_predicated:
  case Intrinsic::arm_mve_maxv_predicated:
  case Intrinsic::arm_mve_minav_predicated:
  case Intrinsic::arm_mve_maxav_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they take as the other input.
    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }

  case Intrinsic::arm_mve_addv: {
    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
    bool Unsigned = N->getConstantOperandVal(2);
    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
  }

  case Intrinsic::arm_mve_addlv:
  case Intrinsic::arm_mve_addlv_predicated: {
    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
    // which recombines the two outputs into an i64.
    bool Unsigned = N->getConstantOperandVal(2);
    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv
                       ? (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs)
                       : (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);

    SmallVector<SDValue, 4> Ops;
    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
      if (i != 2) // skip the unsigned flag
        Ops.push_back(N->getOperand(i));

    SDLoc dl(N);
    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
                       val.getValue(1));
  }
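
  // DAG-level sketch of the arm_mve_addlv lowering above (value numbers are
  // illustrative, not from a specific test):
  //   t0: i32, t1: i32 = ARMISD::VADDLVu <vector operands>
  //   t2: i64 = ISD::BUILD_PAIR t0, t1
  // The BUILD_PAIR recombines the two i32 halves into the single i64 that the
  // intrinsic returns.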
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them. As with the vector shift intrinsics, this is done during DAG
/// combining instead of DAG legalizing because the build_vectors for 64-bit
/// vector element shift counts are generally not legal, and it is hard to see
/// their values after they get legalized to loads from a constant pool.
static SDValue PerformShiftCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
      N->getOperand(0)->getOpcode() == ISD::AND &&
      N->getOperand(0)->hasOneUse()) {
    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
      return SDValue();
    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
    // usually show up because instcombine prefers to canonicalize it to
    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
    // out of GEP lowering in some cases.
    SDValue N0 = N->getOperand(0);
    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!ShiftAmtNode)
      return SDValue();
    uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!AndMaskNode)
      return SDValue();
    uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
    // Don't transform uxtb/uxth.
    if (AndMask == 255 || AndMask == 65535)
      return SDValue();
    if (isMask_32(AndMask)) {
      uint32_t MaskedBits = llvm::countl_zero(AndMask);
      if (MaskedBits > ShiftAmt) {
        SDLoc DL(N);
        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
        return DAG.getNode(
            ISD::SRL, DL, MVT::i32, SHL,
            DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
      }
    }
  }

  // Nothing to be done for scalar shifts.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!VT.isVector() || !TLI.isTypeLegal(VT))
    return SDValue();
  if (ST->hasMVEIntegerOps())
    return SDValue();

  int64_t Cnt;

  switch (N->getOpcode()) {
  default: llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
      SDLoc dl(N);
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
    break;

  case ISD::SRA:
  case ISD::SRL:
    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
      unsigned VShiftOpc =
          (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
      SDLoc dl(N);
      return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    }
  }
  return SDValue();
}
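
// Worked example of the Thumb1 (shl (and x, AndMask), ShiftAmt) rewrite above,
// with illustrative constants: for AndMask == 0x3ffff and ShiftAmt == 2,
// MaskedBits is countl_zero(0x3ffff) == 14, so the node becomes
// (srl (shl x, 14), 12). The left shift discards the masked-off high bits and
// the logical right shift leaves the kept bits two positions up, matching the
// original and+shl while using only immediate shifts (presumably the point of
// the Thumb1-only guard, since the wide mask constant would otherwise have to
// be materialized).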
// Look for a sign/zero/fpextend extend of a larger than legal load. This can
// be split into multiple extending loads, which are simpler to deal with than
// an arbitrary extend. For fp extends we use an integer extending load and a
// VCVTL to convert the type to an f32.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::LOAD)
    return SDValue();
  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();
  EVT FromVT = LD->getValueType(0);
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
    NumElements = 4;
  if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
    NumElements = 4;
  if (NumElements == 0 ||
      (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
      FromVT.getVectorNumElements() % NumElements != 0 ||
      !isPowerOf2_32(NumElements))
    return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  Align Alignment = LD->getOriginalAlign();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

  ISD::LoadExtType NewExtType =
      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);

  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> Chains;
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));

    SDValue NewLoad =
        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                    Alignment, MMOFlags, AAInfo);
    Loads.push_back(NewLoad);
    Chains.push_back(SDValue(NewLoad.getNode(), 1));
  }

  // Float truncs need to be extended with VCVTB's into their floating point
  // types.
  if (FromEltVT == MVT::f16) {
    SmallVector<SDValue, 4> Extends;

    for (unsigned i = 0; i < Loads.size(); i++) {
      SDValue LoadBC =
          DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
      SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
                                  DAG.getConstant(0, DL, MVT::i32));
      Extends.push_back(FPExt);
    }

    Loads = Extends;
  }

  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
}
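
// Illustrative shape of the split performed above (types chosen to satisfy
// the NumElements table, value names invented): a zero_extend of a v8i8 load
// from %p to v8i32 becomes two v4i8->v4i32 zero-extending loads, one at %p
// and one 4 bytes further on. Their results are joined with a CONCAT_VECTORS
// and their chains with a TokenFactor that replaces the original load's
// chain.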
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  SDValue N0 = N->getOperand(0);

  // Check for sign- and zero-extensions of vector extract operations of 8- and
  // 16-bit vector elements. NEON and MVE support these directly. They are
  // handled during DAG combining because type legalization will promote them
  // to 32-bit types and it is messy to recognize the operations after that.
  if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Vec = N0.getOperand(0);
    SDValue Lane = N0.getOperand(1);
    EVT VT = N->getValueType(0);
    EVT EltVT = N0.getValueType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();

    if (VT == MVT::i32 && (EltVT == MVT::i8 || EltVT == MVT::i16) &&
        TLI.isTypeLegal(Vec.getValueType()) && isa<ConstantSDNode>(Lane)) {

      unsigned Opc = 0;
      switch (N->getOpcode()) {
      default: llvm_unreachable("unexpected opcode");
      case ISD::SIGN_EXTEND:
        Opc = ARMISD::VGETLANEs;
        break;
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
        Opc = ARMISD::VGETLANEu;
        break;
      }
      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
    }
  }

  if (ST->hasMVEIntegerOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;

  return SDValue();
}

static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                      const ARMSubtarget *ST) {
  if (ST->hasMVEFloatOps())
    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
      return NewLoad;

  return SDValue();
}

// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
// constant bounds.
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) {
  if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
      !Subtarget->isThumb2())
    return SDValue();

  EVT VT = Op.getValueType();
  SDValue Op0 = Op.getOperand(0);

  if (VT != MVT::i32 ||
      (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
      !isa<ConstantSDNode>(Op.getOperand(1)) ||
      !isa<ConstantSDNode>(Op0.getOperand(1)))
    return SDValue();

  SDValue Min = Op;
  SDValue Max = Op0;
  SDValue Input = Op0.getOperand(0);
  if (Min.getOpcode() == ISD::SMAX)
    std::swap(Min, Max);

  APInt MinC = Min.getConstantOperandAPInt(1);
  APInt MaxC = Max.getConstantOperandAPInt(1);

  if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
      !(MinC + 1).isPowerOf2())
    return SDValue();

  SDLoc DL(Op);
  if (MinC == ~MaxC)
    return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
                       DAG.getConstant(MinC.countr_one(), DL, VT));
  if (MaxC == 0)
    return DAG.getNode(ARMISD::USAT, DL, VT, Input,
                       DAG.getConstant(MinC.countr_one(), DL, VT));

  return SDValue();
}
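
// Worked example for the scalar saturate lowering above (constants are
// illustrative): smin(smax(x, -256), 255) has MinC == 255 and MaxC == -256,
// so MinC + 1 is a power of two and MinC == ~MaxC, and the node becomes
// ARMISD::SSAT of x with constant operand MinC.countr_one() == 8, i.e. a
// signed clamp to [-256, 255]. With the same upper bound but MaxC == 0,
// smin(smax(x, 0), 255) instead becomes ARMISD::USAT, an unsigned clamp to
// [0, 255].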
/// PerformMinMaxCombine - Target-specific DAG combining for creating
/// truncating saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (VT == MVT::i32)
    return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);

  if (!ST->hasMVEIntegerOps())
    return SDValue();

  if (SDValue V = PerformVQDMULHCombine(N, DAG))
    return V;

  if (VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
    // Check one is a smin and the other is a smax
    if (Min->getOpcode() != ISD::SMIN)
      std::swap(Min, Max);
    if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
      return false;

    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 15) - 1, true);
    else //if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 7) - 1, true);

    APInt MinC, MaxC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
        MaxC != ~SaturateC)
      return false;
    return true;
  };

  if (IsSignedSaturate(N, N0.getNode())) {
    SDLoc DL(N);
    MVT ExtVT, HalfVT;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtVT = MVT::v4i16;
    } else { // if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtVT = MVT::v8i8;
    }

    // Create a VQMOVNB with undef top lanes, then sign extended into the top
    // half. That extend will hopefully be removed if only the bottom bits are
    // demanded (through a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
                    N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
                       DAG.getValueType(ExtVT));
  }

  auto IsUnsignedSaturate = [&](SDNode *Min) {
    // For unsigned, we just need to check for <= 0xffff
    if (Min->getOpcode() != ISD::UMIN)
      return false;

    APInt SaturateC;
    if (VT == MVT::v4i32)
      SaturateC = APInt(32, (1 << 16) - 1, true);
    else //if (VT == MVT::v8i16)
      SaturateC = APInt(16, (1 << 8) - 1, true);

    APInt MinC;
    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
        MinC != SaturateC)
      return false;
    return true;
  };

  if (IsUnsignedSaturate(N)) {
    SDLoc DL(N);
    MVT HalfVT;
    unsigned ExtConst;
    if (VT == MVT::v4i32) {
      HalfVT = MVT::v8i16;
      ExtConst = 0x0000FFFF;
    } else { //if (VT == MVT::v8i16)
      HalfVT = MVT::v16i8;
      ExtConst = 0x00FF;
    }

    // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
    // an AND. That extend will hopefully be removed if only the bottom bits
    // are demanded (through a truncating store, for example).
    SDValue VQMOVN =
        DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
                    DAG.getConstant(0, DL, MVT::i32));
    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
    return DAG.getNode(ISD::AND, DL, VT, Bitcast,
                       DAG.getConstant(ExtConst, DL, VT));
  }

  return SDValue();
}

static const APInt *isPowerOf2Constant(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  if (!C)
    return nullptr;
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}

SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV,
                                                   SelectionDAG &DAG) const {
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions.
This will18097// always be a win if CM is a single bit, will always be no worse than the18098// TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is18099// three bits (due to the extra IT instruction).1810018101SDValue Op0 = CMOV->getOperand(0);18102SDValue Op1 = CMOV->getOperand(1);18103auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();18104SDValue CmpZ = CMOV->getOperand(4);1810518106// The compare must be against zero.18107if (!isNullConstant(CmpZ->getOperand(1)))18108return SDValue();1810918110assert(CmpZ->getOpcode() == ARMISD::CMPZ);18111SDValue And = CmpZ->getOperand(0);18112if (And->getOpcode() != ISD::AND)18113return SDValue();18114const APInt *AndC = isPowerOf2Constant(And->getOperand(1));18115if (!AndC)18116return SDValue();18117SDValue X = And->getOperand(0);1811818119if (CC == ARMCC::EQ) {18120// We're performing an "equal to zero" compare. Swap the operands so we18121// canonicalize on a "not equal to zero" compare.18122std::swap(Op0, Op1);18123} else {18124assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");18125}1812618127if (Op1->getOpcode() != ISD::OR)18128return SDValue();1812918130ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));18131if (!OrC)18132return SDValue();18133SDValue Y = Op1->getOperand(0);1813418135if (Op0 != Y)18136return SDValue();1813718138// Now, is it profitable to continue?18139APInt OrCI = OrC->getAPIntValue();18140unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;18141if (OrCI.popcount() > Heuristic)18142return SDValue();1814318144// Lastly, can we determine that the bits defined by OrCI18145// are zero in Y?18146KnownBits Known = DAG.computeKnownBits(Y);18147if ((OrCI & Known.Zero) != OrCI)18148return SDValue();1814918150// OK, we can do the combine.18151SDValue V = Y;18152SDLoc dl(X);18153EVT VT = X.getValueType();18154unsigned BitInX = AndC->logBase2();1815518156if (BitInX != 0) {18157// We must shift X first.18158X = DAG.getNode(ISD::SRL, dl, VT, X,18159DAG.getConstant(BitInX, dl, VT));18160}1816118162for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();18163BitInY < NumActiveBits; ++BitInY) {18164if (OrCI[BitInY] == 0)18165continue;18166APInt Mask(VT.getSizeInBits(), 0);18167Mask.setBit(BitInY);18168V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,18169// Confusingly, the operand is an *inverted* mask.18170DAG.getConstant(~Mask, dl, VT));18171}1817218173return V;18174}1817518176// Given N, the value controlling the conditional branch, search for the loop18177// intrinsic, returning it, along with how the value is used. 
We need to handle18178// patterns such as the following:18179// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)18180// (brcond (setcc (loop.decrement), 0, eq), exit)18181// (brcond (setcc (loop.decrement), 0, ne), header)18182static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,18183bool &Negate) {18184switch (N->getOpcode()) {18185default:18186break;18187case ISD::XOR: {18188if (!isa<ConstantSDNode>(N.getOperand(1)))18189return SDValue();18190if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())18191return SDValue();18192Negate = !Negate;18193return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);18194}18195case ISD::SETCC: {18196auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));18197if (!Const)18198return SDValue();18199if (Const->isZero())18200Imm = 0;18201else if (Const->isOne())18202Imm = 1;18203else18204return SDValue();18205CC = cast<CondCodeSDNode>(N.getOperand(2))->get();18206return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);18207}18208case ISD::INTRINSIC_W_CHAIN: {18209unsigned IntOp = N.getConstantOperandVal(1);18210if (IntOp != Intrinsic::test_start_loop_iterations &&18211IntOp != Intrinsic::loop_decrement_reg)18212return SDValue();18213return N;18214}18215}18216return SDValue();18217}1821818219static SDValue PerformHWLoopCombine(SDNode *N,18220TargetLowering::DAGCombinerInfo &DCI,18221const ARMSubtarget *ST) {1822218223// The hwloop intrinsics that we're interested are used for control-flow,18224// either for entering or exiting the loop:18225// - test.start.loop.iterations will test whether its operand is zero. If it18226// is zero, the proceeding branch should not enter the loop.18227// - loop.decrement.reg also tests whether its operand is zero. If it is18228// zero, the proceeding branch should not branch back to the beginning of18229// the loop.18230// So here, we need to check that how the brcond is using the result of each18231// of the intrinsics to ensure that we're branching to the right place at the18232// right time.1823318234ISD::CondCode CC;18235SDValue Cond;18236int Imm = 1;18237bool Negate = false;18238SDValue Chain = N->getOperand(0);18239SDValue Dest;1824018241if (N->getOpcode() == ISD::BRCOND) {18242CC = ISD::SETEQ;18243Cond = N->getOperand(1);18244Dest = N->getOperand(2);18245} else {18246assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");18247CC = cast<CondCodeSDNode>(N->getOperand(1))->get();18248Cond = N->getOperand(2);18249Dest = N->getOperand(4);18250if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {18251if (!Const->isOne() && !Const->isZero())18252return SDValue();18253Imm = Const->getZExtValue();18254} else18255return SDValue();18256}1825718258SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);18259if (!Int)18260return SDValue();1826118262if (Negate)18263CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);1826418265auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {18266return (CC == ISD::SETEQ && Imm == 0) ||18267(CC == ISD::SETNE && Imm == 1) ||18268(CC == ISD::SETLT && Imm == 1) ||18269(CC == ISD::SETULT && Imm == 1);18270};1827118272auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {18273return (CC == ISD::SETEQ && Imm == 1) ||18274(CC == ISD::SETNE && Imm == 0) ||18275(CC == ISD::SETGT && Imm == 0) ||18276(CC == ISD::SETUGT && Imm == 0) ||18277(CC == ISD::SETGE && Imm == 1) ||18278(CC == ISD::SETUGE && Imm == 1);18279};1828018281assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&18282"unsupported condition");1828318284SDLoc 
dl(Int);18285SelectionDAG &DAG = DCI.DAG;18286SDValue Elements = Int.getOperand(2);18287unsigned IntOp = Int->getConstantOperandVal(1);18288assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)18289&& "expected single br user");18290SDNode *Br = *N->use_begin();18291SDValue OtherTarget = Br->getOperand(1);1829218293// Update the unconditional branch to branch to the given Dest.18294auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {18295SDValue NewBrOps[] = { Br->getOperand(0), Dest };18296SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);18297DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);18298};1829918300if (IntOp == Intrinsic::test_start_loop_iterations) {18301SDValue Res;18302SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);18303// We expect this 'instruction' to branch when the counter is zero.18304if (IsTrueIfZero(CC, Imm)) {18305SDValue Ops[] = {Chain, Setup, Dest};18306Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);18307} else {18308// The logic is the reverse of what we need for WLS, so find the other18309// basic block target: the target of the proceeding br.18310UpdateUncondBr(Br, Dest, DAG);1831118312SDValue Ops[] = {Chain, Setup, OtherTarget};18313Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);18314}18315// Update LR count to the new value18316DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);18317// Update chain18318DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));18319return Res;18320} else {18321SDValue Size =18322DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);18323SDValue Args[] = { Int.getOperand(0), Elements, Size, };18324SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,18325DAG.getVTList(MVT::i32, MVT::Other), Args);18326DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());1832718328// We expect this instruction to branch when the count is not zero.18329SDValue Target = IsFalseIfZero(CC, Imm) ? 
Dest : OtherTarget;1833018331// Update the unconditional branch to target the loop preheader if we've18332// found the condition has been reversed.18333if (Target == OtherTarget)18334UpdateUncondBr(Br, Dest, DAG);1833518336Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,18337SDValue(LoopDec.getNode(), 1), Chain);1833818339SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };18340return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);18341}18342return SDValue();18343}1834418345/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.18346SDValue18347ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {18348SDValue Cmp = N->getOperand(4);18349if (Cmp.getOpcode() != ARMISD::CMPZ)18350// Only looking at NE cases.18351return SDValue();1835218353EVT VT = N->getValueType(0);18354SDLoc dl(N);18355SDValue LHS = Cmp.getOperand(0);18356SDValue RHS = Cmp.getOperand(1);18357SDValue Chain = N->getOperand(0);18358SDValue BB = N->getOperand(1);18359SDValue ARMcc = N->getOperand(2);18360ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();1836118362// (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))18363// -> (brcond Chain BB CC CPSR Cmp)18364if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&18365LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&18366LHS->getOperand(0)->hasOneUse() &&18367isNullConstant(LHS->getOperand(0)->getOperand(0)) &&18368isOneConstant(LHS->getOperand(0)->getOperand(1)) &&18369isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {18370return DAG.getNode(18371ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),18372LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));18373}1837418375return SDValue();18376}1837718378/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.18379SDValue18380ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {18381SDValue Cmp = N->getOperand(4);18382if (Cmp.getOpcode() != ARMISD::CMPZ)18383// Only looking at EQ and NE cases.18384return SDValue();1838518386EVT VT = N->getValueType(0);18387SDLoc dl(N);18388SDValue LHS = Cmp.getOperand(0);18389SDValue RHS = Cmp.getOperand(1);18390SDValue FalseVal = N->getOperand(0);18391SDValue TrueVal = N->getOperand(1);18392SDValue ARMcc = N->getOperand(2);18393ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();1839418395// BFI is only available on V6T2+.18396if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {18397SDValue R = PerformCMOVToBFICombine(N, DAG);18398if (R)18399return R;18400}1840118402// Simplify18403// mov r1, r018404// cmp r1, x18405// mov r0, y18406// moveq r0, x18407// to18408// cmp r0, x18409// movne r0, y18410//18411// mov r1, r018412// cmp r1, x18413// mov r0, x18414// movne r0, y18415// to18416// cmp r0, x18417// movne r0, y18418/// FIXME: Turn this into a target neutral optimization?18419SDValue Res;18420if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {18421Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,18422N->getOperand(3), Cmp);18423} else if (CC == ARMCC::EQ && TrueVal == RHS) {18424SDValue ARMcc;18425SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);18426Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,18427N->getOperand(3), NewCmp);18428}1842918430// (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))18431// -> (cmov F T CC CPSR Cmp)18432if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() 
&& isNullConstant(LHS->getOperand(0)) &&
      isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                       LHS->getOperand(2), LHS->getOperand(3),
                       LHS->getOperand(4));
  }

  if (!VT.isInteger())
    return SDValue();

  // Fold away an unnecessary CMPZ/CMOV
  // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) ->
  // if C1==EQ -> CMOV A, B, C2, $cpsr, D
  // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D
  if (N->getConstantOperandVal(2) == ARMCC::EQ ||
      N->getConstantOperandVal(2) == ARMCC::NE) {
    ARMCC::CondCodes Cond;
    if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) {
      if (N->getConstantOperandVal(2) == ARMCC::NE)
        Cond = ARMCC::getOppositeCondition(Cond);
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32),
                         N->getOperand(3), C);
    }
  }

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting
        // that right by 5 bits makes it 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
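        // For example (illustrative values): with x == 7 and y == 7 the SUB
        // gives 0, CTLZ gives 32 (0b100000) and the SRL by 5 leaves exactly 1.
        // For any x != y the SUB is non-zero, so CTLZ is at most 31, bit 5 is
        // clear, and the shift produces 0.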
In other words, a carry C == 1 when x == y, C == 018477// otherwise.18478// The final UADDO_CARRY computes18479// x - y + (0 - (x - y)) + C == C18480SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);18481SDVTList VTs = DAG.getVTList(VT, MVT::i32);18482SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);18483// ISD::USUBO_CARRY returns a borrow but we want the carry here18484// actually.18485SDValue Carry =18486DAG.getNode(ISD::SUB, dl, MVT::i32,18487DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));18488Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);18489}18490} else if (CC == ARMCC::NE && !isNullConstant(RHS) &&18491(!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {18492// This seems pointless but will allow us to combine it further below.18493// CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):118494SDValue Sub =18495DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);18496SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,18497Sub.getValue(1), SDValue());18498Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,18499N->getOperand(3), CPSRGlue.getValue(1));18500FalseVal = Sub;18501}18502} else if (isNullConstant(TrueVal)) {18503if (CC == ARMCC::EQ && !isNullConstant(RHS) &&18504(!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {18505// This seems pointless but will allow us to combine it further below18506// Note that we change == for != as this is the dual for the case above.18507// CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):118508SDValue Sub =18509DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);18510SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,18511Sub.getValue(1), SDValue());18512Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,18513DAG.getConstant(ARMCC::NE, dl, MVT::i32),18514N->getOperand(3), CPSRGlue.getValue(1));18515FalseVal = Sub;18516}18517}1851818519// On Thumb1, the DAG above may be further combined if z is a power of 218520// (z == 2 ^ K).18521// CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->18522// t1 = (USUBO (SUB x, y), 1)18523// t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)18524// Result = if K != 0 then (SHL t2:0, K) else t2:018525//18526// This also handles the special case of comparing against zero; it's18527// essentially, the same pattern, except there's no SUBC:18528// CMOV x, z, !=, (CMPZ x, 0) ->18529// t1 = (USUBO x, 1)18530// t2 = (USUBO_CARRY x, t1:0, t1:1)18531// Result = if K != 0 then (SHL t2:0, K) else t2:018532const APInt *TrueConst;18533if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&18534((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&18535FalseVal.getOperand(1) == RHS) ||18536(FalseVal == LHS && isNullConstant(RHS))) &&18537(TrueConst = isPowerOf2Constant(TrueVal))) {18538SDVTList VTs = DAG.getVTList(VT, MVT::i32);18539unsigned ShiftAmount = TrueConst->logBase2();18540if (ShiftAmount)18541TrueVal = DAG.getConstant(1, dl, VT);18542SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);18543Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,18544Subc.getValue(1));1854518546if (ShiftAmount)18547Res = DAG.getNode(ISD::SHL, dl, VT, Res,18548DAG.getConstant(ShiftAmount, dl, MVT::i32));18549}1855018551if (Res.getNode()) {18552KnownBits Known = DAG.computeKnownBits(SDValue(N,0));18553// Capture demanded bits information that would be otherwise lost.18554if (Known.Zero == 0xfffffffe)18555Res = DAG.getNode(ISD::AssertZext, dl, 
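        // Illustrative example: for the 0/1 sequences built above,
        // computeKnownBits will typically report Known.Zero == 0xfffffffe, so
        // the AssertZext to i1 records that only bit 0 can ever be set and
        // lets later combines drop redundant masking or zero-extension of Res.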
MVT::i32, Res,18556DAG.getValueType(MVT::i1));18557else if (Known.Zero == 0xffffff00)18558Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,18559DAG.getValueType(MVT::i8));18560else if (Known.Zero == 0xffff0000)18561Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,18562DAG.getValueType(MVT::i16));18563}1856418565return Res;18566}1856718568static SDValue PerformBITCASTCombine(SDNode *N,18569TargetLowering::DAGCombinerInfo &DCI,18570const ARMSubtarget *ST) {18571SelectionDAG &DAG = DCI.DAG;18572SDValue Src = N->getOperand(0);18573EVT DstVT = N->getValueType(0);1857418575// Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.18576if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {18577EVT SrcVT = Src.getValueType();18578if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())18579return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));18580}1858118582// We may have a bitcast of something that has already had this bitcast18583// combine performed on it, so skip past any VECTOR_REG_CASTs.18584while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)18585Src = Src.getOperand(0);1858618587// Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that18588// would be generated is at least the width of the element type.18589EVT SrcVT = Src.getValueType();18590if ((Src.getOpcode() == ARMISD::VMOVIMM ||18591Src.getOpcode() == ARMISD::VMVNIMM ||18592Src.getOpcode() == ARMISD::VMOVFPIMM) &&18593SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&18594DAG.getDataLayout().isBigEndian())18595return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);1859618597// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x18598if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))18599return R;1860018601return SDValue();18602}1860318604// Some combines for the MVETrunc truncations legalizer helper. 
Also lowers the18605// node into stack operations after legalizeOps.18606SDValue ARMTargetLowering::PerformMVETruncCombine(18607SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {18608SelectionDAG &DAG = DCI.DAG;18609EVT VT = N->getValueType(0);18610SDLoc DL(N);1861118612// MVETrunc(Undef, Undef) -> Undef18613if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))18614return DAG.getUNDEF(VT);1861518616// MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc18617if (N->getNumOperands() == 2 &&18618N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&18619N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)18620return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),18621N->getOperand(0).getOperand(1),18622N->getOperand(1).getOperand(0),18623N->getOperand(1).getOperand(1));1862418625// MVETrunc(shuffle, shuffle) -> VMOVN18626if (N->getNumOperands() == 2 &&18627N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&18628N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {18629auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());18630auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());1863118632if (S0->getOperand(0) == S1->getOperand(0) &&18633S0->getOperand(1) == S1->getOperand(1)) {18634// Construct complete shuffle mask18635SmallVector<int, 8> Mask(S0->getMask());18636Mask.append(S1->getMask().begin(), S1->getMask().end());1863718638if (isVMOVNTruncMask(Mask, VT, false))18639return DAG.getNode(18640ARMISD::VMOVN, DL, VT,18641DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),18642DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),18643DAG.getConstant(1, DL, MVT::i32));18644if (isVMOVNTruncMask(Mask, VT, true))18645return DAG.getNode(18646ARMISD::VMOVN, DL, VT,18647DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),18648DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),18649DAG.getConstant(1, DL, MVT::i32));18650}18651}1865218653// For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the18654// truncate to a buildvector to allow the generic optimisations to kick in.18655if (all_of(N->ops(), [](SDValue Op) {18656return Op.getOpcode() == ISD::BUILD_VECTOR ||18657Op.getOpcode() == ISD::VECTOR_SHUFFLE ||18658(Op.getOpcode() == ISD::BITCAST &&18659Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);18660})) {18661SmallVector<SDValue, 8> Extracts;18662for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {18663SDValue O = N->getOperand(Op);18664for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {18665SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,18666DAG.getConstant(i, DL, MVT::i32));18667Extracts.push_back(Ext);18668}18669}18670return DAG.getBuildVector(VT, DL, Extracts);18671}1867218673// If we are late in the legalization process and nothing has optimised18674// the trunc to anything better, lower it to a stack store and reload,18675// performing the truncation whilst keeping the lanes in the correct order:18676// VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;18677if (!DCI.isAfterLegalizeDAG())18678return SDValue();1867918680SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));18681int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();18682int NumIns = N->getNumOperands();18683assert((NumIns == 2 || NumIns == 4) &&18684"Expected 2 or 4 inputs to an MVETrunc");18685EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());18686if (N->getNumOperands() == 4)18687StoreVT = 
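  // Worked example of the layout below: for MVETRUNC(v4i32 a, v4i32 b)
  // producing v8i16, StoreVT is v4i16; the two inputs are written with
  // truncating stores at byte offsets 0 and 8 (I * 16 / NumIns), and the whole
  // 16-byte slot is then reloaded as a single v8i16 vector.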
StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());1868818689SmallVector<SDValue> Chains;18690for (int I = 0; I < NumIns; I++) {18691SDValue Ptr = DAG.getNode(18692ISD::ADD, DL, StackPtr.getValueType(), StackPtr,18693DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));18694MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(18695DAG.getMachineFunction(), SPFI, I * 16 / NumIns);18696SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),18697Ptr, MPI, StoreVT, Align(4));18698Chains.push_back(Ch);18699}1870018701SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);18702MachinePointerInfo MPI =18703MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);18704return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));18705}1870618707// Take a MVEEXT(load x) and split that into (extload x, extload x+8)18708static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,18709SelectionDAG &DAG) {18710SDValue N0 = N->getOperand(0);18711LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());18712if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())18713return SDValue();1871418715EVT FromVT = LD->getMemoryVT();18716EVT ToVT = N->getValueType(0);18717if (!ToVT.isVector())18718return SDValue();18719assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);18720EVT ToEltVT = ToVT.getVectorElementType();18721EVT FromEltVT = FromVT.getVectorElementType();1872218723unsigned NumElements = 0;18724if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))18725NumElements = 4;18726if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)18727NumElements = 8;18728assert(NumElements != 0);1872918730ISD::LoadExtType NewExtType =18731N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;18732if (LD->getExtensionType() != ISD::NON_EXTLOAD &&18733LD->getExtensionType() != ISD::EXTLOAD &&18734LD->getExtensionType() != NewExtType)18735return SDValue();1873618737LLVMContext &C = *DAG.getContext();18738SDLoc DL(LD);18739// Details about the old load18740SDValue Ch = LD->getChain();18741SDValue BasePtr = LD->getBasePtr();18742Align Alignment = LD->getOriginalAlign();18743MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();18744AAMDNodes AAInfo = LD->getAAInfo();1874518746SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());18747EVT NewFromVT = EVT::getVectorVT(18748C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);18749EVT NewToVT = EVT::getVectorVT(18750C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);1875118752SmallVector<SDValue, 4> Loads;18753SmallVector<SDValue, 4> Chains;18754for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {18755unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;18756SDValue NewPtr =18757DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));1875818759SDValue NewLoad =18760DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,18761LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,18762Alignment, MMOFlags, AAInfo);18763Loads.push_back(NewLoad);18764Chains.push_back(SDValue(NewLoad.getNode(), 1));18765}1876618767SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);18768DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);18769return DAG.getMergeValues(Loads, DL);18770}1877118772// Perform combines for MVEEXT. 
If it has not been optimized to anything better
// before lowering, it gets converted to stack store and extloads performing the
// extend whilst still keeping the same lane ordering.
SDValue ARMTargetLowering::PerformMVEExtCombine(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
  assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");

  EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
      *DAG.getContext());
  auto Extend = [&](SDValue V) {
    SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
    return N->getOpcode() == ARMISD::MVESEXT
               ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
                             DAG.getValueType(ExtVT))
               : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
  };

  // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
  if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
    SDValue Ext = Extend(N->getOperand(0));
    return DAG.getMergeValues({Ext, Ext}, DL);
  }

  // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
    ArrayRef<int> Mask = SVN->getMask();
    assert(Mask.size() == 2 * VT.getVectorNumElements());
    assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
    unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
    SDValue Op0 = SVN->getOperand(0);
    SDValue Op1 = SVN->getOperand(1);

    auto CheckInregMask = [&](int Start, int Offset) {
      for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
        if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
          return false;
      return true;
    };
    SDValue V0 = SDValue(N, 0);
    SDValue V1 = SDValue(N, 1);
    if (CheckInregMask(0, 0))
      V0 = Extend(Op0);
    else if (CheckInregMask(0, 1))
      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
    else if (CheckInregMask(0, Mask.size()))
      V0 = Extend(Op1);
    else if (CheckInregMask(0, Mask.size() + 1))
      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));

    if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
      V1 = Extend(Op1);
    else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
    else if (CheckInregMask(VT.getVectorNumElements(), 0))
      V1 = Extend(Op0);
    else if (CheckInregMask(VT.getVectorNumElements(), 1))
      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));

    if (V0.getNode() != N || V1.getNode() != N)
      return DAG.getMergeValues({V0, V1}, DL);
  }

  // MVEEXT(load) -> extload, extload
  if (N->getOperand(0)->getOpcode() == ISD::LOAD)
    if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
      return L;

  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Lower to a stack store and reload:
  // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  int NumOuts = N->getNumValues();
  assert((NumOuts == 2 || NumOuts == 4) &&
         "Expected 2 or 4 outputs to an MVEEXT");
  EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
      *DAG.getContext());
  if (N->getNumOperands() == 4)
    LoadVT = 
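  // Worked example of the layout below: for a v8i16 input extended to two
  // v4i32 results, the input is stored once into the 16-byte slot and each
  // half is reloaded with a sign- or zero-extending load of LoadVT (v4i16) at
  // byte offsets 0 and 8 (I * 16 / NumOuts).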
LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());1885718858MachinePointerInfo MPI =18859MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);18860SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),18861StackPtr, MPI, Align(4));1886218863SmallVector<SDValue> Loads;18864for (int I = 0; I < NumOuts; I++) {18865SDValue Ptr = DAG.getNode(18866ISD::ADD, DL, StackPtr.getValueType(), StackPtr,18867DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));18868MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(18869DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);18870SDValue Load = DAG.getExtLoad(18871N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,18872VT, Chain, Ptr, MPI, LoadVT, Align(4));18873Loads.push_back(Load);18874}1887518876return DAG.getMergeValues(Loads, DL);18877}1887818879SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,18880DAGCombinerInfo &DCI) const {18881switch (N->getOpcode()) {18882default: break;18883case ISD::SELECT_CC:18884case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);18885case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);18886case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);18887case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);18888case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);18889case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);18890case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);18891case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);18892case ISD::OR: return PerformORCombine(N, DCI, Subtarget);18893case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);18894case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);18895case ISD::BRCOND:18896case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);18897case ARMISD::ADDC:18898case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);18899case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);18900case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);18901case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);18902case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);18903case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);18904case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);18905case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);18906case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);18907case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);18908case ISD::EXTRACT_VECTOR_ELT:18909return PerformExtractEltCombine(N, DCI, Subtarget);18910case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);18911case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);18912case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);18913case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);18914case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);18915case ISD::FP_TO_SINT:18916case ISD::FP_TO_UINT:18917return PerformVCVTCombine(N, DCI.DAG, Subtarget);18918case ISD::FADD:18919return PerformFADDCombine(N, DCI.DAG, Subtarget);18920case ISD::FMUL:18921return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);18922case ISD::INTRINSIC_WO_CHAIN:18923return PerformIntrinsicCombine(N, DCI);18924case ISD::SHL:18925case ISD::SRA:18926case ISD::SRL:18927return PerformShiftCombine(N, DCI, Subtarget);18928case ISD::SIGN_EXTEND:18929case 
ISD::ZERO_EXTEND:18930case ISD::ANY_EXTEND:18931return PerformExtendCombine(N, DCI.DAG, Subtarget);18932case ISD::FP_EXTEND:18933return PerformFPExtendCombine(N, DCI.DAG, Subtarget);18934case ISD::SMIN:18935case ISD::UMIN:18936case ISD::SMAX:18937case ISD::UMAX:18938return PerformMinMaxCombine(N, DCI.DAG, Subtarget);18939case ARMISD::CMOV:18940return PerformCMOVCombine(N, DCI.DAG);18941case ARMISD::BRCOND:18942return PerformBRCONDCombine(N, DCI.DAG);18943case ARMISD::CMPZ:18944return PerformCMPZCombine(N, DCI.DAG);18945case ARMISD::CSINC:18946case ARMISD::CSINV:18947case ARMISD::CSNEG:18948return PerformCSETCombine(N, DCI.DAG);18949case ISD::LOAD:18950return PerformLOADCombine(N, DCI, Subtarget);18951case ARMISD::VLD1DUP:18952case ARMISD::VLD2DUP:18953case ARMISD::VLD3DUP:18954case ARMISD::VLD4DUP:18955return PerformVLDCombine(N, DCI);18956case ARMISD::BUILD_VECTOR:18957return PerformARMBUILD_VECTORCombine(N, DCI);18958case ISD::BITCAST:18959return PerformBITCASTCombine(N, DCI, Subtarget);18960case ARMISD::PREDICATE_CAST:18961return PerformPREDICATE_CASTCombine(N, DCI);18962case ARMISD::VECTOR_REG_CAST:18963return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);18964case ARMISD::MVETRUNC:18965return PerformMVETruncCombine(N, DCI);18966case ARMISD::MVESEXT:18967case ARMISD::MVEZEXT:18968return PerformMVEExtCombine(N, DCI);18969case ARMISD::VCMP:18970return PerformVCMPCombine(N, DCI.DAG, Subtarget);18971case ISD::VECREDUCE_ADD:18972return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);18973case ARMISD::VADDVs:18974case ARMISD::VADDVu:18975case ARMISD::VADDLVs:18976case ARMISD::VADDLVu:18977case ARMISD::VADDLVAs:18978case ARMISD::VADDLVAu:18979case ARMISD::VMLAVs:18980case ARMISD::VMLAVu:18981case ARMISD::VMLALVs:18982case ARMISD::VMLALVu:18983case ARMISD::VMLALVAs:18984case ARMISD::VMLALVAu:18985return PerformReduceShuffleCombine(N, DCI.DAG);18986case ARMISD::VMOVN:18987return PerformVMOVNCombine(N, DCI);18988case ARMISD::VQMOVNs:18989case ARMISD::VQMOVNu:18990return PerformVQMOVNCombine(N, DCI);18991case ARMISD::VQDMULH:18992return PerformVQDMULHCombine(N, DCI);18993case ARMISD::ASRL:18994case ARMISD::LSRL:18995case ARMISD::LSLL:18996return PerformLongShiftCombine(N, DCI.DAG);18997case ARMISD::SMULWB: {18998unsigned BitWidth = N->getValueType(0).getSizeInBits();18999APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);19000if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))19001return SDValue();19002break;19003}19004case ARMISD::SMULWT: {19005unsigned BitWidth = N->getValueType(0).getSizeInBits();19006APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);19007if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))19008return SDValue();19009break;19010}19011case ARMISD::SMLALBB:19012case ARMISD::QADD16b:19013case ARMISD::QSUB16b:19014case ARMISD::UQADD16b:19015case ARMISD::UQSUB16b: {19016unsigned BitWidth = N->getValueType(0).getSizeInBits();19017APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);19018if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||19019(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))19020return SDValue();19021break;19022}19023case ARMISD::SMLALBT: {19024unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();19025APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);19026unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();19027APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);19028if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) 
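    // Note: SMLALBT multiplies the bottom half of operand 0 by the top half
    // of operand 1, which is why the two demanded-bit masks differ here;
    // SMLALTB below uses the same masks with the operands swapped.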
||19029(SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))19030return SDValue();19031break;19032}19033case ARMISD::SMLALTB: {19034unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();19035APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);19036unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();19037APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);19038if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||19039(SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))19040return SDValue();19041break;19042}19043case ARMISD::SMLALTT: {19044unsigned BitWidth = N->getValueType(0).getSizeInBits();19045APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);19046if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||19047(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))19048return SDValue();19049break;19050}19051case ARMISD::QADD8b:19052case ARMISD::QSUB8b:19053case ARMISD::UQADD8b:19054case ARMISD::UQSUB8b: {19055unsigned BitWidth = N->getValueType(0).getSizeInBits();19056APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);19057if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||19058(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))19059return SDValue();19060break;19061}19062case ISD::INTRINSIC_VOID:19063case ISD::INTRINSIC_W_CHAIN:19064switch (N->getConstantOperandVal(1)) {19065case Intrinsic::arm_neon_vld1:19066case Intrinsic::arm_neon_vld1x2:19067case Intrinsic::arm_neon_vld1x3:19068case Intrinsic::arm_neon_vld1x4:19069case Intrinsic::arm_neon_vld2:19070case Intrinsic::arm_neon_vld3:19071case Intrinsic::arm_neon_vld4:19072case Intrinsic::arm_neon_vld2lane:19073case Intrinsic::arm_neon_vld3lane:19074case Intrinsic::arm_neon_vld4lane:19075case Intrinsic::arm_neon_vld2dup:19076case Intrinsic::arm_neon_vld3dup:19077case Intrinsic::arm_neon_vld4dup:19078case Intrinsic::arm_neon_vst1:19079case Intrinsic::arm_neon_vst1x2:19080case Intrinsic::arm_neon_vst1x3:19081case Intrinsic::arm_neon_vst1x4:19082case Intrinsic::arm_neon_vst2:19083case Intrinsic::arm_neon_vst3:19084case Intrinsic::arm_neon_vst4:19085case Intrinsic::arm_neon_vst2lane:19086case Intrinsic::arm_neon_vst3lane:19087case Intrinsic::arm_neon_vst4lane:19088return PerformVLDCombine(N, DCI);19089case Intrinsic::arm_mve_vld2q:19090case Intrinsic::arm_mve_vld4q:19091case Intrinsic::arm_mve_vst2q:19092case Intrinsic::arm_mve_vst4q:19093return PerformMVEVLDCombine(N, DCI);19094default: break;19095}19096break;19097}19098return SDValue();19099}1910019101bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,19102EVT VT) const {19103return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);19104}1910519106bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,19107Align Alignment,19108MachineMemOperand::Flags,19109unsigned *Fast) const {19110// Depends what it gets converted into if the type is weird.19111if (!VT.isSimple())19112return false;1911319114// The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus19115bool AllowsUnaligned = Subtarget->allowsUnalignedMem();19116auto Ty = VT.getSimpleVT().SimpleTy;1911719118if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {19119// Unaligned access can use (for example) LRDB, LRDH, LDR19120if (AllowsUnaligned) {19121if (Fast)19122*Fast = Subtarget->hasV7Ops();19123return true;19124}19125}1912619127if (Ty == MVT::f64 || Ty == MVT::v2f64) {19128// For any little-endian targets with neon, we can support unaligned ld/st19129// of D and Q (e.g. 
{D0,D1}) registers by using vld1.i8/vst1.i8.19130// A big-endian target may also explicitly support unaligned accesses19131if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {19132if (Fast)19133*Fast = 1;19134return true;19135}19136}1913719138if (!Subtarget->hasMVEIntegerOps())19139return false;1914019141// These are for predicates19142if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||19143Ty == MVT::v2i1)) {19144if (Fast)19145*Fast = 1;19146return true;19147}1914819149// These are for truncated stores/narrowing loads. They are fine so long as19150// the alignment is at least the size of the item being loaded19151if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&19152Alignment >= VT.getScalarSizeInBits() / 8) {19153if (Fast)19154*Fast = true;19155return true;19156}1915719158// In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and19159// VSTRW.U32 all store the vector register in exactly the same format, and19160// differ only in the range of their immediate offset field and the required19161// alignment. So there is always a store that can be used, regardless of19162// actual type.19163//19164// For big endian, that is not the case. But can still emit a (VSTRB.U8;19165// VREV64.8) pair and get the same effect. This will likely be better than19166// aligning the vector through the stack.19167if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||19168Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||19169Ty == MVT::v2f64) {19170if (Fast)19171*Fast = 1;19172return true;19173}1917419175return false;19176}191771917819179EVT ARMTargetLowering::getOptimalMemOpType(19180const MemOp &Op, const AttributeList &FuncAttributes) const {19181// See if we can use NEON instructions for this...19182if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&19183!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {19184unsigned Fast;19185if (Op.size() >= 16 &&19186(Op.isAligned(Align(16)) ||19187(allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),19188MachineMemOperand::MONone, &Fast) &&19189Fast))) {19190return MVT::v2f64;19191} else if (Op.size() >= 8 &&19192(Op.isAligned(Align(8)) ||19193(allowsMisalignedMemoryAccesses(19194MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&19195Fast))) {19196return MVT::f64;19197}19198}1919919200// Let the target-independent logic figure it out.19201return MVT::Other;19202}1920319204// 64-bit integers are split into their high and low parts and held in two19205// different registers, so the trunc is free since the low register can just19206// be used.19207bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {19208if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())19209return false;19210unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();19211unsigned DestBits = DstTy->getPrimitiveSizeInBits();19212return (SrcBits == 64 && DestBits == 32);19213}1921419215bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {19216if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||19217!DstVT.isInteger())19218return false;19219unsigned SrcBits = SrcVT.getSizeInBits();19220unsigned DestBits = DstVT.getSizeInBits();19221return (SrcBits == 64 && DestBits == 32);19222}1922319224bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {19225if (Val.getOpcode() != ISD::LOAD)19226return false;1922719228EVT VT1 = Val.getValueType();19229if (!VT1.isSimple() || !VT1.isInteger() ||19230!VT2.isSimple() || !VT2.isInteger())19231return 
false;1923219233switch (VT1.getSimpleVT().SimpleTy) {19234default: break;19235case MVT::i1:19236case MVT::i8:19237case MVT::i16:19238// 8-bit and 16-bit loads implicitly zero-extend to 32-bits.19239return true;19240}1924119242return false;19243}1924419245bool ARMTargetLowering::isFNegFree(EVT VT) const {19246if (!VT.isSimple())19247return false;1924819249// There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that19250// negate values directly (fneg is free). So, we don't want to let the DAG19251// combiner rewrite fneg into xors and some other instructions. For f16 and19252// FullFP16 argument passing, some bitcast nodes may be introduced,19253// triggering this DAG combine rewrite, so we are avoiding that with this.19254switch (VT.getSimpleVT().SimpleTy) {19255default: break;19256case MVT::f16:19257return Subtarget->hasFullFP16();19258}1925919260return false;19261}1926219263/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth19264/// of the vector elements.19265static bool areExtractExts(Value *Ext1, Value *Ext2) {19266auto areExtDoubled = [](Instruction *Ext) {19267return Ext->getType()->getScalarSizeInBits() ==192682 * Ext->getOperand(0)->getType()->getScalarSizeInBits();19269};1927019271if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||19272!match(Ext2, m_ZExtOrSExt(m_Value())) ||19273!areExtDoubled(cast<Instruction>(Ext1)) ||19274!areExtDoubled(cast<Instruction>(Ext2)))19275return false;1927619277return true;19278}1927919280/// Check if sinking \p I's operands to I's basic block is profitable, because19281/// the operands can be folded into a target instruction, e.g.19282/// sext/zext can be folded into vsubl.19283bool ARMTargetLowering::shouldSinkOperands(Instruction *I,19284SmallVectorImpl<Use *> &Ops) const {19285if (!I->getType()->isVectorTy())19286return false;1928719288if (Subtarget->hasNEON()) {19289switch (I->getOpcode()) {19290case Instruction::Sub:19291case Instruction::Add: {19292if (!areExtractExts(I->getOperand(0), I->getOperand(1)))19293return false;19294Ops.push_back(&I->getOperandUse(0));19295Ops.push_back(&I->getOperandUse(1));19296return true;19297}19298default:19299return false;19300}19301}1930219303if (!Subtarget->hasMVEIntegerOps())19304return false;1930519306auto IsFMSMul = [&](Instruction *I) {19307if (!I->hasOneUse())19308return false;19309auto *Sub = cast<Instruction>(*I->users().begin());19310return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;19311};19312auto IsFMS = [&](Instruction *I) {19313if (match(I->getOperand(0), m_FNeg(m_Value())) ||19314match(I->getOperand(1), m_FNeg(m_Value())))19315return true;19316return false;19317};1931819319auto IsSinker = [&](Instruction *I, int Operand) {19320switch (I->getOpcode()) {19321case Instruction::Add:19322case Instruction::Mul:19323case Instruction::FAdd:19324case Instruction::ICmp:19325case Instruction::FCmp:19326return true;19327case Instruction::FMul:19328return !IsFMSMul(I);19329case Instruction::Sub:19330case Instruction::FSub:19331case Instruction::Shl:19332case Instruction::LShr:19333case Instruction::AShr:19334return Operand == 1;19335case Instruction::Call:19336if (auto *II = dyn_cast<IntrinsicInst>(I)) {19337switch (II->getIntrinsicID()) {19338case Intrinsic::fma:19339return !IsFMS(I);19340case Intrinsic::sadd_sat:19341case Intrinsic::uadd_sat:19342case Intrinsic::arm_mve_add_predicated:19343case Intrinsic::arm_mve_mul_predicated:19344case Intrinsic::arm_mve_qadd_predicated:19345case Intrinsic::arm_mve_vhadd:19346case 
Intrinsic::arm_mve_hadd_predicated:19347case Intrinsic::arm_mve_vqdmull:19348case Intrinsic::arm_mve_vqdmull_predicated:19349case Intrinsic::arm_mve_vqdmulh:19350case Intrinsic::arm_mve_qdmulh_predicated:19351case Intrinsic::arm_mve_vqrdmulh:19352case Intrinsic::arm_mve_qrdmulh_predicated:19353case Intrinsic::arm_mve_fma_predicated:19354return true;19355case Intrinsic::ssub_sat:19356case Intrinsic::usub_sat:19357case Intrinsic::arm_mve_sub_predicated:19358case Intrinsic::arm_mve_qsub_predicated:19359case Intrinsic::arm_mve_hsub_predicated:19360case Intrinsic::arm_mve_vhsub:19361return Operand == 1;19362default:19363return false;19364}19365}19366return false;19367default:19368return false;19369}19370};1937119372for (auto OpIdx : enumerate(I->operands())) {19373Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());19374// Make sure we are not already sinking this operand19375if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))19376continue;1937719378Instruction *Shuffle = Op;19379if (Shuffle->getOpcode() == Instruction::BitCast)19380Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));19381// We are looking for a splat that can be sunk.19382if (!Shuffle ||19383!match(Shuffle, m_Shuffle(19384m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),19385m_Undef(), m_ZeroMask())))19386continue;19387if (!IsSinker(I, OpIdx.index()))19388continue;1938919390// All uses of the shuffle should be sunk to avoid duplicating it across gpr19391// and vector registers19392for (Use &U : Op->uses()) {19393Instruction *Insn = cast<Instruction>(U.getUser());19394if (!IsSinker(Insn, U.getOperandNo()))19395return false;19396}1939719398Ops.push_back(&Shuffle->getOperandUse(0));19399if (Shuffle != Op)19400Ops.push_back(&Op->getOperandUse(0));19401Ops.push_back(&OpIdx.value());19402}19403return true;19404}1940519406Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {19407if (!Subtarget->hasMVEIntegerOps())19408return nullptr;19409Type *SVIType = SVI->getType();19410Type *ScalarType = SVIType->getScalarType();1941119412if (ScalarType->isFloatTy())19413return Type::getInt32Ty(SVIType->getContext());19414if (ScalarType->isHalfTy())19415return Type::getInt16Ty(SVIType->getContext());19416return nullptr;19417}1941819419bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {19420EVT VT = ExtVal.getValueType();1942119422if (!isTypeLegal(VT))19423return false;1942419425if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {19426if (Ld->isExpandingLoad())19427return false;19428}1942919430if (Subtarget->hasMVEIntegerOps())19431return true;1943219433// Don't create a loadext if we can fold the extension into a wide/long19434// instruction.19435// If there's more than one user instruction, the loadext is desirable no19436// matter what. 
There can be two uses by the same instruction.19437if (ExtVal->use_empty() ||19438!ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))19439return true;1944019441SDNode *U = *ExtVal->use_begin();19442if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||19443U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))19444return false;1944519446return true;19447}1944819449bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {19450if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())19451return false;1945219453if (!isTypeLegal(EVT::getEVT(Ty1)))19454return false;1945519456assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");1945719458// Assuming the caller doesn't have a zeroext or signext return parameter,19459// truncation all the way down to i1 is valid.19460return true;19461}1946219463/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster19464/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be19465/// expanded to FMAs when this method returns true, otherwise fmuladd is19466/// expanded to fmul + fadd.19467///19468/// ARM supports both fused and unfused multiply-add operations; we already19469/// lower a pair of fmul and fadd to the latter so it's not clear that there19470/// would be a gain or that the gain would be worthwhile enough to risk19471/// correctness bugs.19472///19473/// For MVE, we set this to true as it helps simplify the need for some19474/// patterns (and we don't have the non-fused floating point instruction).19475bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,19476EVT VT) const {19477if (!VT.isSimple())19478return false;1947919480switch (VT.getSimpleVT().SimpleTy) {19481case MVT::v4f32:19482case MVT::v8f16:19483return Subtarget->hasMVEFloatOps();19484case MVT::f16:19485return Subtarget->useFPVFMx16();19486case MVT::f32:19487return Subtarget->useFPVFMx();19488case MVT::f64:19489return Subtarget->useFPVFMx64();19490default:19491break;19492}1949319494return false;19495}1949619497static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {19498if (V < 0)19499return false;1950019501unsigned Scale = 1;19502switch (VT.getSimpleVT().SimpleTy) {19503case MVT::i1:19504case MVT::i8:19505// Scale == 1;19506break;19507case MVT::i16:19508// Scale == 2;19509Scale = 2;19510break;19511default:19512// On thumb1 we load most things (i32, i64, floats, etc) with a LDR19513// Scale == 4;19514Scale = 4;19515break;19516}1951719518if ((V & (Scale - 1)) != 0)19519return false;19520return isUInt<5>(V / Scale);19521}1952219523static bool isLegalT2AddressImmediate(int64_t V, EVT VT,19524const ARMSubtarget *Subtarget) {19525if (!VT.isInteger() && !VT.isFloatingPoint())19526return false;19527if (VT.isVector() && Subtarget->hasNEON())19528return false;19529if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&19530!Subtarget->hasMVEFloatOps())19531return false;1953219533bool IsNeg = false;19534if (V < 0) {19535IsNeg = true;19536V = -V;19537}1953819539unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);1954019541// MVE: size * imm719542if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {19543switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {19544case MVT::i32:19545case MVT::f32:19546return isShiftedUInt<7,2>(V);19547case MVT::i16:19548case MVT::f16:19549return isShiftedUInt<7,1>(V);19550case MVT::i8:19551return isUInt<7>(V);19552default:19553return false;19554}19555}1955619557// half VLDR: 2 * imm819558if (VT.isFloatingPoint() && 
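  // For illustration: isShiftedUInt<8, 1> accepts byte offsets that are
  // multiples of 2 with magnitude up to 510 (the scaled 8-bit range of a
  // half-precision VLDR/VSTR), just as the MVE cases above scale a 7-bit
  // immediate by the element size (e.g. multiples of 4 up to 508 for i32).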
NumBytes == 2 && Subtarget->hasFPRegs16())19559return isShiftedUInt<8, 1>(V);19560// VLDR and LDRD: 4 * imm819561if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)19562return isShiftedUInt<8, 2>(V);1956319564if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {19565// + imm12 or - imm819566if (IsNeg)19567return isUInt<8>(V);19568return isUInt<12>(V);19569}1957019571return false;19572}1957319574/// isLegalAddressImmediate - Return true if the integer value can be used19575/// as the offset of the target addressing mode for load / store of the19576/// given type.19577static bool isLegalAddressImmediate(int64_t V, EVT VT,19578const ARMSubtarget *Subtarget) {19579if (V == 0)19580return true;1958119582if (!VT.isSimple())19583return false;1958419585if (Subtarget->isThumb1Only())19586return isLegalT1AddressImmediate(V, VT);19587else if (Subtarget->isThumb2())19588return isLegalT2AddressImmediate(V, VT, Subtarget);1958919590// ARM mode.19591if (V < 0)19592V = - V;19593switch (VT.getSimpleVT().SimpleTy) {19594default: return false;19595case MVT::i1:19596case MVT::i8:19597case MVT::i32:19598// +- imm1219599return isUInt<12>(V);19600case MVT::i16:19601// +- imm819602return isUInt<8>(V);19603case MVT::f32:19604case MVT::f64:19605if (!Subtarget->hasVFP2Base()) // FIXME: NEON?19606return false;19607return isShiftedUInt<8, 2>(V);19608}19609}1961019611bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,19612EVT VT) const {19613int Scale = AM.Scale;19614if (Scale < 0)19615return false;1961619617switch (VT.getSimpleVT().SimpleTy) {19618default: return false;19619case MVT::i1:19620case MVT::i8:19621case MVT::i16:19622case MVT::i32:19623if (Scale == 1)19624return true;19625// r + r << imm19626Scale = Scale & ~1;19627return Scale == 2 || Scale == 4 || Scale == 8;19628case MVT::i64:19629// FIXME: What are we trying to model here? ldrd doesn't have an r + r19630// version in Thumb mode.19631// r + r19632if (Scale == 1)19633return true;19634// r * 2 (this can be lowered to r + r).19635if (!AM.HasBaseReg && Scale == 2)19636return true;19637return false;19638case MVT::isVoid:19639// Note, we allow "void" uses (basically, uses that aren't loads or19640// stores), because arm allows folding a scale into many arithmetic19641// operations. This should be made more precise and revisited later.1964219643// Allow r << imm, but the imm has to be a multiple of two.19644if (Scale & 1) return false;19645return isPowerOf2_32(Scale);19646}19647}1964819649bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,19650EVT VT) const {19651const int Scale = AM.Scale;1965219653// Negative scales are not supported in Thumb1.19654if (Scale < 0)19655return false;1965619657// Thumb1 addressing modes do not support register scaling excepting the19658// following cases:19659// 1. Scale == 1 means no scaling.19660// 2. 
Scale == 2 this can be lowered to r + r if there is no base register.19661return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);19662}1966319664/// isLegalAddressingMode - Return true if the addressing mode represented19665/// by AM is legal for this target, for a load/store of the specified type.19666bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,19667const AddrMode &AM, Type *Ty,19668unsigned AS, Instruction *I) const {19669EVT VT = getValueType(DL, Ty, true);19670if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))19671return false;1967219673// Can never fold addr of global into load/store.19674if (AM.BaseGV)19675return false;1967619677switch (AM.Scale) {19678case 0: // no scale reg, must be "r+i" or "r", or "i".19679break;19680default:19681// ARM doesn't support any R+R*scale+imm addr modes.19682if (AM.BaseOffs)19683return false;1968419685if (!VT.isSimple())19686return false;1968719688if (Subtarget->isThumb1Only())19689return isLegalT1ScaledAddressingMode(AM, VT);1969019691if (Subtarget->isThumb2())19692return isLegalT2ScaledAddressingMode(AM, VT);1969319694int Scale = AM.Scale;19695switch (VT.getSimpleVT().SimpleTy) {19696default: return false;19697case MVT::i1:19698case MVT::i8:19699case MVT::i32:19700if (Scale < 0) Scale = -Scale;19701if (Scale == 1)19702return true;19703// r + r << imm19704return isPowerOf2_32(Scale & ~1);19705case MVT::i16:19706case MVT::i64:19707// r +/- r19708if (Scale == 1 || (AM.HasBaseReg && Scale == -1))19709return true;19710// r * 2 (this can be lowered to r + r).19711if (!AM.HasBaseReg && Scale == 2)19712return true;19713return false;1971419715case MVT::isVoid:19716// Note, we allow "void" uses (basically, uses that aren't loads or19717// stores), because arm allows folding a scale into many arithmetic19718// operations. 
This should be made more precise and revisited later.1971919720// Allow r << imm, but the imm has to be a multiple of two.19721if (Scale & 1) return false;19722return isPowerOf2_32(Scale);19723}19724}19725return true;19726}1972719728/// isLegalICmpImmediate - Return true if the specified immediate is legal19729/// icmp immediate, that is the target has icmp instructions which can compare19730/// a register against the immediate without having to materialize the19731/// immediate into a register.19732bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {19733// Thumb2 and ARM modes can use cmn for negative immediates.19734if (!Subtarget->isThumb())19735return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||19736ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;19737if (Subtarget->isThumb2())19738return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||19739ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;19740// Thumb1 doesn't have cmn, and only 8-bit immediates.19741return Imm >= 0 && Imm <= 255;19742}1974319744/// isLegalAddImmediate - Return true if the specified immediate is a legal add19745/// *or sub* immediate, that is the target has add or sub instructions which can19746/// add a register with the immediate without having to materialize the19747/// immediate into a register.19748bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {19749// Same encoding for add/sub, just flip the sign.19750int64_t AbsImm = std::abs(Imm);19751if (!Subtarget->isThumb())19752return ARM_AM::getSOImmVal(AbsImm) != -1;19753if (Subtarget->isThumb2())19754return ARM_AM::getT2SOImmVal(AbsImm) != -1;19755// Thumb1 only has 8-bit unsigned immediate.19756return AbsImm >= 0 && AbsImm <= 255;19757}1975819759// Return false to prevent folding19760// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,19761// if the folding leads to worse code.19762bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,19763SDValue ConstNode) const {19764// Let the DAGCombiner decide for vector types and large types.19765const EVT VT = AddNode.getValueType();19766if (VT.isVector() || VT.getScalarSizeInBits() > 32)19767return true;1976819769// It is worse if c0 is legal add immediate, while c1*c0 is not19770// and has to be composed by at least two instructions.19771const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));19772const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);19773const int64_t C0 = C0Node->getSExtValue();19774APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();19775if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))19776return true;19777if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)19778return false;1977919780// Default to true and let the DAGCombiner decide.19781return true;19782}1978319784static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,19785bool isSEXTLoad, SDValue &Base,19786SDValue &Offset, bool &isInc,19787SelectionDAG &DAG) {19788if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)19789return false;1979019791if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {19792// AddressingMode 319793Base = Ptr->getOperand(0);19794if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {19795int RHSC = (int)RHS->getZExtValue();19796if (RHSC < 0 && RHSC > -256) {19797assert(Ptr->getOpcode() == ISD::ADD);19798isInc = false;19799Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));19800return true;19801}19802}19803isInc = (Ptr->getOpcode() == 
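    // Note: addressing mode 3 (halfword and sign-extending byte accesses)
    // only encodes an 8-bit immediate, which is why only constant offsets in
    // (-256, 0) were folded above; any other offset is kept as the original
    // ADD/SUB operand below.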
ISD::ADD);19804Offset = Ptr->getOperand(1);19805return true;19806} else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {19807// AddressingMode 219808if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {19809int RHSC = (int)RHS->getZExtValue();19810if (RHSC < 0 && RHSC > -0x1000) {19811assert(Ptr->getOpcode() == ISD::ADD);19812isInc = false;19813Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));19814Base = Ptr->getOperand(0);19815return true;19816}19817}1981819819if (Ptr->getOpcode() == ISD::ADD) {19820isInc = true;19821ARM_AM::ShiftOpc ShOpcVal=19822ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());19823if (ShOpcVal != ARM_AM::no_shift) {19824Base = Ptr->getOperand(1);19825Offset = Ptr->getOperand(0);19826} else {19827Base = Ptr->getOperand(0);19828Offset = Ptr->getOperand(1);19829}19830return true;19831}1983219833isInc = (Ptr->getOpcode() == ISD::ADD);19834Base = Ptr->getOperand(0);19835Offset = Ptr->getOperand(1);19836return true;19837}1983819839// FIXME: Use VLDM / VSTM to emulate indexed FP load / store.19840return false;19841}1984219843static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,19844bool isSEXTLoad, SDValue &Base,19845SDValue &Offset, bool &isInc,19846SelectionDAG &DAG) {19847if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)19848return false;1984919850Base = Ptr->getOperand(0);19851if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {19852int RHSC = (int)RHS->getZExtValue();19853if (RHSC < 0 && RHSC > -0x100) { // 8 bits.19854assert(Ptr->getOpcode() == ISD::ADD);19855isInc = false;19856Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));19857return true;19858} else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.19859isInc = Ptr->getOpcode() == ISD::ADD;19860Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));19861return true;19862}19863}1986419865return false;19866}1986719868static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,19869bool isSEXTLoad, bool IsMasked, bool isLE,19870SDValue &Base, SDValue &Offset,19871bool &isInc, SelectionDAG &DAG) {19872if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)19873return false;19874if (!isa<ConstantSDNode>(Ptr->getOperand(1)))19875return false;1987619877// We allow LE non-masked loads to change the type (for example use a vldrb.819878// as opposed to a vldrw.32). 
This can allow extra addressing modes or19879// alignments for what is otherwise an equivalent instruction.19880bool CanChangeType = isLE && !IsMasked;1988119882ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));19883int RHSC = (int)RHS->getZExtValue();1988419885auto IsInRange = [&](int RHSC, int Limit, int Scale) {19886if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {19887assert(Ptr->getOpcode() == ISD::ADD);19888isInc = false;19889Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));19890return true;19891} else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {19892isInc = Ptr->getOpcode() == ISD::ADD;19893Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));19894return true;19895}19896return false;19897};1989819899// Try to find a matching instruction based on s/zext, Alignment, Offset and19900// (in BE/masked) type.19901Base = Ptr->getOperand(0);19902if (VT == MVT::v4i16) {19903if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))19904return true;19905} else if (VT == MVT::v4i8 || VT == MVT::v8i8) {19906if (IsInRange(RHSC, 0x80, 1))19907return true;19908} else if (Alignment >= 4 &&19909(CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&19910IsInRange(RHSC, 0x80, 4))19911return true;19912else if (Alignment >= 2 &&19913(CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&19914IsInRange(RHSC, 0x80, 2))19915return true;19916else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))19917return true;19918return false;19919}1992019921/// getPreIndexedAddressParts - returns true by value, base pointer and19922/// offset pointer and addressing mode by reference if the node's address19923/// can be legally represented as pre-indexed load / store address.19924bool19925ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,19926SDValue &Offset,19927ISD::MemIndexedMode &AM,19928SelectionDAG &DAG) const {19929if (Subtarget->isThumb1Only())19930return false;1993119932EVT VT;19933SDValue Ptr;19934Align Alignment;19935bool isSEXTLoad = false;19936bool IsMasked = false;19937if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {19938Ptr = LD->getBasePtr();19939VT = LD->getMemoryVT();19940Alignment = LD->getAlign();19941isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;19942} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {19943Ptr = ST->getBasePtr();19944VT = ST->getMemoryVT();19945Alignment = ST->getAlign();19946} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {19947Ptr = LD->getBasePtr();19948VT = LD->getMemoryVT();19949Alignment = LD->getAlign();19950isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;19951IsMasked = true;19952} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {19953Ptr = ST->getBasePtr();19954VT = ST->getMemoryVT();19955Alignment = ST->getAlign();19956IsMasked = true;19957} else19958return false;1995919960bool isInc;19961bool isLegal = false;19962if (VT.isVector())19963isLegal = Subtarget->hasMVEIntegerOps() &&19964getMVEIndexedAddressParts(19965Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,19966Subtarget->isLittle(), Base, Offset, isInc, DAG);19967else {19968if (Subtarget->isThumb2())19969isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,19970Offset, isInc, DAG);19971else19972isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,19973Offset, isInc, DAG);19974}19975if (!isLegal)19976return false;1997719978AM = isInc ? 
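  // Illustrative example: a pre-indexed access such as "ldr r0, [r1, #4]!"
  // loads from r1 + 4 and writes the updated address back to r1; PRE_INC or
  // PRE_DEC tells the caller which direction the base register moves.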
ISD::PRE_INC : ISD::PRE_DEC;19979return true;19980}1998119982/// getPostIndexedAddressParts - returns true by value, base pointer and19983/// offset pointer and addressing mode by reference if this node can be19984/// combined with a load / store to form a post-indexed load / store.19985bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,19986SDValue &Base,19987SDValue &Offset,19988ISD::MemIndexedMode &AM,19989SelectionDAG &DAG) const {19990EVT VT;19991SDValue Ptr;19992Align Alignment;19993bool isSEXTLoad = false, isNonExt;19994bool IsMasked = false;19995if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {19996VT = LD->getMemoryVT();19997Ptr = LD->getBasePtr();19998Alignment = LD->getAlign();19999isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;20000isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;20001} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {20002VT = ST->getMemoryVT();20003Ptr = ST->getBasePtr();20004Alignment = ST->getAlign();20005isNonExt = !ST->isTruncatingStore();20006} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {20007VT = LD->getMemoryVT();20008Ptr = LD->getBasePtr();20009Alignment = LD->getAlign();20010isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;20011isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;20012IsMasked = true;20013} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {20014VT = ST->getMemoryVT();20015Ptr = ST->getBasePtr();20016Alignment = ST->getAlign();20017isNonExt = !ST->isTruncatingStore();20018IsMasked = true;20019} else20020return false;2002120022if (Subtarget->isThumb1Only()) {20023// Thumb-1 can do a limited post-inc load or store as an updating LDM. It20024// must be non-extending/truncating, i32, with an offset of 4.20025assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");20026if (Op->getOpcode() != ISD::ADD || !isNonExt)20027return false;20028auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));20029if (!RHS || RHS->getZExtValue() != 4)20030return false;20031if (Alignment < Align(4))20032return false;2003320034Offset = Op->getOperand(1);20035Base = Op->getOperand(0);20036AM = ISD::POST_INC;20037return true;20038}2003920040bool isInc;20041bool isLegal = false;20042if (VT.isVector())20043isLegal = Subtarget->hasMVEIntegerOps() &&20044getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,20045Subtarget->isLittle(), Base, Offset,20046isInc, DAG);20047else {20048if (Subtarget->isThumb2())20049isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,20050isInc, DAG);20051else20052isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,20053isInc, DAG);20054}20055if (!isLegal)20056return false;2005720058if (Ptr != Base) {20059// Swap base ptr and offset to catch more post-index load / store when20060// it's legal. In Thumb2 mode, offset must be an immediate.20061if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&20062!Subtarget->isThumb2())20063std::swap(Base, Offset);2006420065// Post-indexed load / store update the base pointer.20066if (Ptr != Base)20067return false;20068}2006920070AM = isInc ? 
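  // Illustrative example: a post-indexed access such as "ldr r0, [r1], #4"
  // loads from the original r1 and then advances r1 by 4, which is the form
  // selected when this hook reports POST_INC or POST_DEC.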
ISD::POST_INC : ISD::POST_DEC;20071return true;20072}2007320074void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,20075KnownBits &Known,20076const APInt &DemandedElts,20077const SelectionDAG &DAG,20078unsigned Depth) const {20079unsigned BitWidth = Known.getBitWidth();20080Known.resetAll();20081switch (Op.getOpcode()) {20082default: break;20083case ARMISD::ADDC:20084case ARMISD::ADDE:20085case ARMISD::SUBC:20086case ARMISD::SUBE:20087// Special cases when we convert a carry to a boolean.20088if (Op.getResNo() == 0) {20089SDValue LHS = Op.getOperand(0);20090SDValue RHS = Op.getOperand(1);20091// (ADDE 0, 0, C) will give us a single bit.20092if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&20093isNullConstant(RHS)) {20094Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);20095return;20096}20097}20098break;20099case ARMISD::CMOV: {20100// Bits are known zero/one if known on the LHS and RHS.20101Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);20102if (Known.isUnknown())20103return;2010420105KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);20106Known = Known.intersectWith(KnownRHS);20107return;20108}20109case ISD::INTRINSIC_W_CHAIN: {20110Intrinsic::ID IntID =20111static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));20112switch (IntID) {20113default: return;20114case Intrinsic::arm_ldaex:20115case Intrinsic::arm_ldrex: {20116EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();20117unsigned MemBits = VT.getScalarSizeInBits();20118Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);20119return;20120}20121}20122}20123case ARMISD::BFI: {20124// Conservatively, we can recurse down the first operand20125// and just mask out all affected bits.20126Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);2012720128// The operand to BFI is already a mask suitable for removing the bits it20129// sets.20130const APInt &Mask = Op.getConstantOperandAPInt(2);20131Known.Zero &= Mask;20132Known.One &= Mask;20133return;20134}20135case ARMISD::VGETLANEs:20136case ARMISD::VGETLANEu: {20137const SDValue &SrcSV = Op.getOperand(0);20138EVT VecVT = SrcSV.getValueType();20139assert(VecVT.isVector() && "VGETLANE expected a vector type");20140const unsigned NumSrcElts = VecVT.getVectorNumElements();20141ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());20142assert(Pos->getAPIntValue().ult(NumSrcElts) &&20143"VGETLANE index out of bounds");20144unsigned Idx = Pos->getZExtValue();20145APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);20146Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);2014720148EVT VT = Op.getValueType();20149const unsigned DstSz = VT.getScalarSizeInBits();20150const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();20151(void)SrcSz;20152assert(SrcSz == Known.getBitWidth());20153assert(DstSz > SrcSz);20154if (Op.getOpcode() == ARMISD::VGETLANEs)20155Known = Known.sext(DstSz);20156else {20157Known = Known.zext(DstSz);20158}20159assert(DstSz == Known.getBitWidth());20160break;20161}20162case ARMISD::VMOVrh: {20163KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);20164assert(KnownOp.getBitWidth() == 16);20165Known = KnownOp.zext(32);20166break;20167}20168case ARMISD::CSINC:20169case ARMISD::CSINV:20170case ARMISD::CSNEG: {20171KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);20172KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);2017320174// The result is either:20175// CSINC: KnownOp0 or KnownOp1 + 
1
    // CSINV: KnownOp0 or ~KnownOp1
    // CSNEG: KnownOp0 or KnownOp1 * -1
    if (Op.getOpcode() == ARMISD::CSINC)
      KnownOp1 = KnownBits::computeForAddSub(
          /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KnownOp1,
          KnownBits::makeConstant(APInt(32, 1)));
    else if (Op.getOpcode() == ARMISD::CSINV)
      std::swap(KnownOp1.Zero, KnownOp1.One);
    else if (Op.getOpcode() == ARMISD::CSNEG)
      KnownOp1 = KnownBits::mul(
          KnownOp1, KnownBits::makeConstant(APInt(32, -1)));

    Known = KnownOp0.intersectWith(KnownOp1);
    break;
  }
  }
}

bool ARMTargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    TargetLoweringOpt &TLO) const {
  // Delay optimization, so we don't have to deal with illegal types, or block
  // optimizations.
  if (!TLO.LegalOps)
    return false;

  // Only optimize AND for now.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  assert(VT == MVT::i32 && "Unexpected integer type");

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  unsigned Mask = C->getZExtValue();

  unsigned Demanded = DemandedBits.getZExtValue();
  unsigned ShrunkMask = Mask & Demanded;
  unsigned ExpandedMask = Mask | ~Demanded;

  // If the mask is all zeros, let the target-independent code replace the
  // result with zero.
  if (ShrunkMask == 0)
    return false;

  // If the mask is all ones, erase the AND. (Currently, the target-independent
  // code won't do this, so we have to do it explicitly to avoid an infinite
  // loop in obscure cases.)
  if (ExpandedMask == ~0U)
    return TLO.CombineTo(Op, Op.getOperand(0));

  auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
    return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
  };
  auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
    if (NewMask == Mask)
      return true;
    SDLoc DL(Op);
    SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
    SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
    return TLO.CombineTo(Op, NewOp);
  };

  // Prefer uxtb mask.
  if (IsLegalMask(0xFF))
    return UseMask(0xFF);

  // Prefer uxth mask.
  if (IsLegalMask(0xFFFF))
    return UseMask(0xFFFF);

  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if (ShrunkMask < 256)
    return UseMask(ShrunkMask);

  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
    return UseMask(ExpandedMask);

  // Potential improvements:
  //
  // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
  // We could try to prefer Thumb1 immediates which can be lowered to a
  // two-instruction sequence.
  // We could try to recognize more legal ARM/Thumb2 immediates here.

  return false;
}

bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const
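// Added worked example (commentary only, not part of the upstream source):
// for the ARMISD::LSRL/ASRL case below, if the shift amount is a constant
// ShAmt (say 8) and only the top ShAmt bits of result 0 are demanded, those
// bits come entirely from operand 1 shifted left by 32 - ShAmt (here 24), so
// the long shift pair can be replaced by a single i32 ISD::SHL of operand 1.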
{
  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  case ARMISD::ASRL:
  case ARMISD::LSRL: {
    // If this is result 0 and the other result is unused, see if the demand
    // bits allow us to shrink this long shift into a standard small shift in
    // the opposite direction.
    if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
        isa<ConstantSDNode>(Op->getOperand(2))) {
      unsigned ShAmt = Op->getConstantOperandVal(2);
      if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
                                                        << (32 - ShAmt)))
        return TLO.CombineTo(
            Op, TLO.DAG.getNode(
                    ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
                    TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
    }
    break;
  }
  case ARMISD::VBICIMM: {
    SDValue Op0 = Op.getOperand(0);
    unsigned ModImm = Op.getConstantOperandVal(1);
    unsigned EltBits = 0;
    uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
    if ((OriginalDemandedBits & Mask) == 0)
      return TLO.CombineTo(Op, Op0);
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
  // Looking for "rev" which is V6+.
  if (!Subtarget->hasV6Ops())
    return false;

  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  StringRef AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  }

  return false;
}

const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w".
However, by doing this we will force the result20352// to be in register, while the X constraint is much more permissive.20353//20354// Although we are correct (we are free to emit anything, without20355// constraints), we might break use cases that would expect us to be more20356// efficient and emit something else.20357if (!Subtarget->hasVFP2Base())20358return "r";20359if (ConstraintVT.isFloatingPoint())20360return "w";20361if (ConstraintVT.isVector() && Subtarget->hasNEON() &&20362(ConstraintVT.getSizeInBits() == 64 ||20363ConstraintVT.getSizeInBits() == 128))20364return "w";2036520366return "r";20367}2036820369/// getConstraintType - Given a constraint letter, return the type of20370/// constraint it is for this target.20371ARMTargetLowering::ConstraintType20372ARMTargetLowering::getConstraintType(StringRef Constraint) const {20373unsigned S = Constraint.size();20374if (S == 1) {20375switch (Constraint[0]) {20376default: break;20377case 'l': return C_RegisterClass;20378case 'w': return C_RegisterClass;20379case 'h': return C_RegisterClass;20380case 'x': return C_RegisterClass;20381case 't': return C_RegisterClass;20382case 'j': return C_Immediate; // Constant for movw.20383// An address with a single base register. Due to the way we20384// currently handle addresses it is the same as an 'r' memory constraint.20385case 'Q': return C_Memory;20386}20387} else if (S == 2) {20388switch (Constraint[0]) {20389default: break;20390case 'T': return C_RegisterClass;20391// All 'U+' constraints are addresses.20392case 'U': return C_Memory;20393}20394}20395return TargetLowering::getConstraintType(Constraint);20396}2039720398/// Examine constraint type and operand type and determine a weight value.20399/// This object must already have been set up with the operand type20400/// and the current alternative constraint selected.20401TargetLowering::ConstraintWeight20402ARMTargetLowering::getSingleConstraintMatchWeight(20403AsmOperandInfo &info, const char *constraint) const {20404ConstraintWeight weight = CW_Invalid;20405Value *CallOperandVal = info.CallOperandVal;20406// If we don't have a value, we can't do a match,20407// but allow it at the lowest weight.20408if (!CallOperandVal)20409return CW_Default;20410Type *type = CallOperandVal->getType();20411// Look at the constraint type.20412switch (*constraint) {20413default:20414weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);20415break;20416case 'l':20417if (type->isIntegerTy()) {20418if (Subtarget->isThumb())20419weight = CW_SpecificReg;20420else20421weight = CW_Register;20422}20423break;20424case 'w':20425if (type->isFloatingPointTy())20426weight = CW_Register;20427break;20428}20429return weight;20430}2043120432using RCPair = std::pair<unsigned, const TargetRegisterClass *>;2043320434RCPair ARMTargetLowering::getRegForInlineAsmConstraint(20435const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {20436switch (Constraint.size()) {20437case 1:20438// GCC ARM Constraint Letters20439switch (Constraint[0]) {20440case 'l': // Low regs or general regs.20441if (Subtarget->isThumb())20442return RCPair(0U, &ARM::tGPRRegClass);20443return RCPair(0U, &ARM::GPRRegClass);20444case 'h': // High regs or no regs.20445if (Subtarget->isThumb())20446return RCPair(0U, &ARM::hGPRRegClass);20447break;20448case 'r':20449if (Subtarget->isThumb1Only())20450return RCPair(0U, &ARM::tGPRRegClass);20451return RCPair(0U, &ARM::GPRRegClass);20452case 'w':20453if (VT == MVT::Other)20454break;20455if (VT == MVT::f32 || VT == MVT::f16 || VT == 
MVT::bf16)20456return RCPair(0U, &ARM::SPRRegClass);20457if (VT.getSizeInBits() == 64)20458return RCPair(0U, &ARM::DPRRegClass);20459if (VT.getSizeInBits() == 128)20460return RCPair(0U, &ARM::QPRRegClass);20461break;20462case 'x':20463if (VT == MVT::Other)20464break;20465if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)20466return RCPair(0U, &ARM::SPR_8RegClass);20467if (VT.getSizeInBits() == 64)20468return RCPair(0U, &ARM::DPR_8RegClass);20469if (VT.getSizeInBits() == 128)20470return RCPair(0U, &ARM::QPR_8RegClass);20471break;20472case 't':20473if (VT == MVT::Other)20474break;20475if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)20476return RCPair(0U, &ARM::SPRRegClass);20477if (VT.getSizeInBits() == 64)20478return RCPair(0U, &ARM::DPR_VFP2RegClass);20479if (VT.getSizeInBits() == 128)20480return RCPair(0U, &ARM::QPR_VFP2RegClass);20481break;20482}20483break;2048420485case 2:20486if (Constraint[0] == 'T') {20487switch (Constraint[1]) {20488default:20489break;20490case 'e':20491return RCPair(0U, &ARM::tGPREvenRegClass);20492case 'o':20493return RCPair(0U, &ARM::tGPROddRegClass);20494}20495}20496break;2049720498default:20499break;20500}2050120502if (StringRef("{cc}").equals_insensitive(Constraint))20503return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);2050420505return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);20506}2050720508/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops20509/// vector. If it is invalid, don't add anything to Ops.20510void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,20511StringRef Constraint,20512std::vector<SDValue> &Ops,20513SelectionDAG &DAG) const {20514SDValue Result;2051520516// Currently only support length 1 constraints.20517if (Constraint.size() != 1)20518return;2051920520char ConstraintLetter = Constraint[0];20521switch (ConstraintLetter) {20522default: break;20523case 'j':20524case 'I': case 'J': case 'K': case 'L':20525case 'M': case 'N': case 'O':20526ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);20527if (!C)20528return;2052920530int64_t CVal64 = C->getSExtValue();20531int CVal = (int) CVal64;20532// None of these constraints allow values larger than 32 bits. Check20533// that the value fits in an int.20534if (CVal != CVal64)20535return;2053620537switch (ConstraintLetter) {20538case 'j':20539// Constant suitable for movw, must be between 0 and20540// 65535.20541if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))20542if (CVal >= 0 && CVal <= 65535)20543break;20544return;20545case 'I':20546if (Subtarget->isThumb1Only()) {20547// This must be a constant between 0 and 255, for ADD20548// immediates.20549if (CVal >= 0 && CVal <= 255)20550break;20551} else if (Subtarget->isThumb2()) {20552// A constant that can be used as an immediate value in a20553// data-processing instruction.20554if (ARM_AM::getT2SOImmVal(CVal) != -1)20555break;20556} else {20557// A constant that can be used as an immediate value in a20558// data-processing instruction.20559if (ARM_AM::getSOImmVal(CVal) != -1)20560break;20561}20562return;2056320564case 'J':20565if (Subtarget->isThumb1Only()) {20566// This must be a constant between -255 and -1, for negated ADD20567// immediates. This can be used in GCC with an "n" modifier that20568// prints the negated value, for use with SUB instructions. 
It is20569// not useful otherwise but is implemented for compatibility.20570if (CVal >= -255 && CVal <= -1)20571break;20572} else {20573// This must be a constant between -4095 and 4095. It is not clear20574// what this constraint is intended for. Implemented for20575// compatibility with GCC.20576if (CVal >= -4095 && CVal <= 4095)20577break;20578}20579return;2058020581case 'K':20582if (Subtarget->isThumb1Only()) {20583// A 32-bit value where only one byte has a nonzero value. Exclude20584// zero to match GCC. This constraint is used by GCC internally for20585// constants that can be loaded with a move/shift combination.20586// It is not useful otherwise but is implemented for compatibility.20587if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))20588break;20589} else if (Subtarget->isThumb2()) {20590// A constant whose bitwise inverse can be used as an immediate20591// value in a data-processing instruction. This can be used in GCC20592// with a "B" modifier that prints the inverted value, for use with20593// BIC and MVN instructions. It is not useful otherwise but is20594// implemented for compatibility.20595if (ARM_AM::getT2SOImmVal(~CVal) != -1)20596break;20597} else {20598// A constant whose bitwise inverse can be used as an immediate20599// value in a data-processing instruction. This can be used in GCC20600// with a "B" modifier that prints the inverted value, for use with20601// BIC and MVN instructions. It is not useful otherwise but is20602// implemented for compatibility.20603if (ARM_AM::getSOImmVal(~CVal) != -1)20604break;20605}20606return;2060720608case 'L':20609if (Subtarget->isThumb1Only()) {20610// This must be a constant between -7 and 7,20611// for 3-operand ADD/SUB immediate instructions.20612if (CVal >= -7 && CVal < 7)20613break;20614} else if (Subtarget->isThumb2()) {20615// A constant whose negation can be used as an immediate value in a20616// data-processing instruction. This can be used in GCC with an "n"20617// modifier that prints the negated value, for use with SUB20618// instructions. It is not useful otherwise but is implemented for20619// compatibility.20620if (ARM_AM::getT2SOImmVal(-CVal) != -1)20621break;20622} else {20623// A constant whose negation can be used as an immediate value in a20624// data-processing instruction. This can be used in GCC with an "n"20625// modifier that prints the negated value, for use with SUB20626// instructions. It is not useful otherwise but is implemented for20627// compatibility.20628if (ARM_AM::getSOImmVal(-CVal) != -1)20629break;20630}20631return;2063220633case 'M':20634if (Subtarget->isThumb1Only()) {20635// This must be a multiple of 4 between 0 and 1020, for20636// ADD sp + immediate.20637if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))20638break;20639} else {20640// A power of two or a constant between 0 and 32. 
This is used in20641// GCC for the shift amount on shifted register operands, but it is20642// useful in general for any shift amounts.20643if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))20644break;20645}20646return;2064720648case 'N':20649if (Subtarget->isThumb1Only()) {20650// This must be a constant between 0 and 31, for shift amounts.20651if (CVal >= 0 && CVal <= 31)20652break;20653}20654return;2065520656case 'O':20657if (Subtarget->isThumb1Only()) {20658// This must be a multiple of 4 between -508 and 508, for20659// ADD/SUB sp = sp + immediate.20660if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))20661break;20662}20663return;20664}20665Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());20666break;20667}2066820669if (Result.getNode()) {20670Ops.push_back(Result);20671return;20672}20673return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);20674}2067520676static RTLIB::Libcall getDivRemLibcall(20677const SDNode *N, MVT::SimpleValueType SVT) {20678assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||20679N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&20680"Unhandled Opcode in getDivRemLibcall");20681bool isSigned = N->getOpcode() == ISD::SDIVREM ||20682N->getOpcode() == ISD::SREM;20683RTLIB::Libcall LC;20684switch (SVT) {20685default: llvm_unreachable("Unexpected request for libcall!");20686case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;20687case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;20688case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;20689case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;20690}20691return LC;20692}2069320694static TargetLowering::ArgListTy getDivRemArgList(20695const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {20696assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||20697N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&20698"Unhandled Opcode in getDivRemArgList");20699bool isSigned = N->getOpcode() == ISD::SDIVREM ||20700N->getOpcode() == ISD::SREM;20701TargetLowering::ArgListTy Args;20702TargetLowering::ArgListEntry Entry;20703for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {20704EVT ArgVT = N->getOperand(i).getValueType();20705Type *ArgTy = ArgVT.getTypeForEVT(*Context);20706Entry.Node = N->getOperand(i);20707Entry.Ty = ArgTy;20708Entry.IsSExt = isSigned;20709Entry.IsZExt = !isSigned;20710Args.push_back(Entry);20711}20712if (Subtarget->isTargetWindows() && Args.size() >= 2)20713std::swap(Args[0], Args[1]);20714return Args;20715}2071620717SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {20718assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||20719Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||20720Subtarget->isTargetWindows()) &&20721"Register-based DivRem lowering only");20722unsigned Opcode = Op->getOpcode();20723assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&20724"Invalid opcode for Div/Rem lowering");20725bool isSigned = (Opcode == ISD::SDIVREM);20726EVT VT = Op->getValueType(0);20727SDLoc dl(Op);2072820729if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {20730SmallVector<SDValue> Result;20731if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {20732SDValue Res0 =20733DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);20734SDValue Res1 =20735DAG.getNode(ISD::BUILD_PAIR, dl, 
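      // Added note (commentary, not from upstream): when expandDIVREMByConstant
      // succeeds it returns the i64 results split into i32 halves, Result[0..1]
      // holding the quotient and Result[2..3] the remainder, which are repacked
      // with BUILD_PAIR here before being merged as the {div, rem} result pair.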
VT, Result[2], Result[3]);20736return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),20737{Res0, Res1});20738}20739}2074020741Type *Ty = VT.getTypeForEVT(*DAG.getContext());2074220743// If the target has hardware divide, use divide + multiply + subtract:20744// div = a / b20745// rem = a - b * div20746// return {div, rem}20747// This should be lowered into UDIV/SDIV + MLS later on.20748bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()20749: Subtarget->hasDivideInARMMode();20750if (hasDivide && Op->getValueType(0).isSimple() &&20751Op->getSimpleValueType(0) == MVT::i32) {20752unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;20753const SDValue Dividend = Op->getOperand(0);20754const SDValue Divisor = Op->getOperand(1);20755SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);20756SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);20757SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);2075820759SDValue Values[2] = {Div, Rem};20760return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);20761}2076220763RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),20764VT.getSimpleVT().SimpleTy);20765SDValue InChain = DAG.getEntryNode();2076620767TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),20768DAG.getContext(),20769Subtarget);2077020771SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),20772getPointerTy(DAG.getDataLayout()));2077320774Type *RetTy = StructType::get(Ty, Ty);2077520776if (Subtarget->isTargetWindows())20777InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);2077820779TargetLowering::CallLoweringInfo CLI(DAG);20780CLI.setDebugLoc(dl).setChain(InChain)20781.setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))20782.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);2078320784std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);20785return CallInfo.first;20786}2078720788// Lowers REM using divmod helpers20789// see RTABI section 4.2/4.320790SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {20791EVT VT = N->getValueType(0);2079220793if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {20794SmallVector<SDValue> Result;20795if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))20796return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),20797Result[0], Result[1]);20798}2079920800// Build return types (div and rem)20801std::vector<Type*> RetTyParams;20802Type *RetTyElement;2080320804switch (VT.getSimpleVT().SimpleTy) {20805default: llvm_unreachable("Unexpected request for libcall!");20806case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;20807case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;20808case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;20809case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;20810}2081120812RetTyParams.push_back(RetTyElement);20813RetTyParams.push_back(RetTyElement);20814ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);20815Type *RetTy = StructType::get(*DAG.getContext(), ret);2081620817RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().20818SimpleTy);20819SDValue InChain = DAG.getEntryNode();20820TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),20821Subtarget);20822bool isSigned = N->getOpcode() == ISD::SREM;20823SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),20824getPointerTy(DAG.getDataLayout()));2082520826if 
(Subtarget->isTargetWindows())20827InChain = WinDBZCheckDenominator(DAG, N, InChain);2082820829// Lower call20830CallLoweringInfo CLI(DAG);20831CLI.setChain(InChain)20832.setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))20833.setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));20834std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);2083520836// Return second (rem) result operand (first contains div)20837SDNode *ResNode = CallResult.first.getNode();20838assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");20839return ResNode->getOperand(1);20840}2084120842SDValue20843ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {20844assert(Subtarget->isTargetWindows() && "unsupported target platform");20845SDLoc DL(Op);2084620847// Get the inputs.20848SDValue Chain = Op.getOperand(0);20849SDValue Size = Op.getOperand(1);2085020851if (DAG.getMachineFunction().getFunction().hasFnAttribute(20852"no-stack-arg-probe")) {20853MaybeAlign Align =20854cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();20855SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);20856Chain = SP.getValue(1);20857SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);20858if (Align)20859SP =20860DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),20861DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));20862Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);20863SDValue Ops[2] = { SP, Chain };20864return DAG.getMergeValues(Ops, DL);20865}2086620867SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,20868DAG.getConstant(2, DL, MVT::i32));2086920870SDValue Glue;20871Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);20872Glue = Chain.getValue(1);2087320874SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);20875Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);2087620877SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);20878Chain = NewSP.getValue(1);2087920880SDValue Ops[2] = { NewSP, Chain };20881return DAG.getMergeValues(Ops, DL);20882}2088320884SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {20885bool IsStrict = Op->isStrictFPOpcode();20886SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);20887const unsigned DstSz = Op.getValueType().getSizeInBits();20888const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();20889assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&20890"Unexpected type for custom-lowering FP_EXTEND");2089120892assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&20893"With both FP DP and 16, any FP conversion is legal!");2089420895assert(!(DstSz == 32 && Subtarget->hasFP16()) &&20896"With FP16, 16 to 32 conversion is legal!");2089720898// Converting from 32 -> 64 is valid if we have FP64.20899if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {20900// FIXME: Remove this when we have strict fp instruction selection patterns20901if (IsStrict) {20902SDLoc Loc(Op);20903SDValue Result = DAG.getNode(ISD::FP_EXTEND,20904Loc, Op.getValueType(), SrcVal);20905return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);20906}20907return Op;20908}2090920910// Either we are converting from 16 -> 64, without FP16 and/or20911// FP.double-precision or without Armv8-fp. So we must do it in two20912// steps.20913// Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 3220914// without FP16. 
So we must do a function call.20915SDLoc Loc(Op);20916RTLIB::Libcall LC;20917MakeLibCallOptions CallOptions;20918SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();20919for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {20920bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());20921MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);20922MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);20923if (Supported) {20924if (IsStrict) {20925SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,20926{DstVT, MVT::Other}, {Chain, SrcVal});20927Chain = SrcVal.getValue(1);20928} else {20929SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);20930}20931} else {20932LC = RTLIB::getFPEXT(SrcVT, DstVT);20933assert(LC != RTLIB::UNKNOWN_LIBCALL &&20934"Unexpected type for custom-lowering FP_EXTEND");20935std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,20936Loc, Chain);20937}20938}2093920940return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;20941}2094220943SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {20944bool IsStrict = Op->isStrictFPOpcode();2094520946SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);20947EVT SrcVT = SrcVal.getValueType();20948EVT DstVT = Op.getValueType();20949const unsigned DstSz = Op.getValueType().getSizeInBits();20950const unsigned SrcSz = SrcVT.getSizeInBits();20951(void)DstSz;20952assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&20953"Unexpected type for custom-lowering FP_ROUND");2095420955assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&20956"With both FP DP and 16, any FP conversion is legal!");2095720958SDLoc Loc(Op);2095920960// Instruction from 32 -> 16 if hasFP16 is valid20961if (SrcSz == 32 && Subtarget->hasFP16())20962return Op;2096320964// Lib call from 32 -> 16 / 64 -> [32, 16]20965RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);20966assert(LC != RTLIB::UNKNOWN_LIBCALL &&20967"Unexpected type for custom-lowering FP_ROUND");20968MakeLibCallOptions CallOptions;20969SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();20970SDValue Result;20971std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,20972Loc, Chain);20973return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;20974}2097520976bool20977ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {20978// The ARM target isn't yet aware of offsets.20979return false;20980}2098120982bool ARM::isBitFieldInvertedMask(unsigned v) {20983if (v == 0xffffffff)20984return false;2098520986// there can be 1's on either or both "outsides", all the "inside"20987// bits must be 0's20988return isShiftedMask_32(~v);20989}2099020991/// isFPImmLegal - Returns true if the target can instruction select the20992/// specified FP immediate natively. 
If false, the legalizer will20993/// materialize the FP immediate as a load from a constant pool.20994bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,20995bool ForCodeSize) const {20996if (!Subtarget->hasVFP3Base())20997return false;20998if (VT == MVT::f16 && Subtarget->hasFullFP16())20999return ARM_AM::getFP16Imm(Imm) != -1;21000if (VT == MVT::f32 && Subtarget->hasFullFP16() &&21001ARM_AM::getFP32FP16Imm(Imm) != -1)21002return true;21003if (VT == MVT::f32)21004return ARM_AM::getFP32Imm(Imm) != -1;21005if (VT == MVT::f64 && Subtarget->hasFP64())21006return ARM_AM::getFP64Imm(Imm) != -1;21007return false;21008}2100921010/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as21011/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment21012/// specified in the intrinsic calls.21013bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,21014const CallInst &I,21015MachineFunction &MF,21016unsigned Intrinsic) const {21017switch (Intrinsic) {21018case Intrinsic::arm_neon_vld1:21019case Intrinsic::arm_neon_vld2:21020case Intrinsic::arm_neon_vld3:21021case Intrinsic::arm_neon_vld4:21022case Intrinsic::arm_neon_vld2lane:21023case Intrinsic::arm_neon_vld3lane:21024case Intrinsic::arm_neon_vld4lane:21025case Intrinsic::arm_neon_vld2dup:21026case Intrinsic::arm_neon_vld3dup:21027case Intrinsic::arm_neon_vld4dup: {21028Info.opc = ISD::INTRINSIC_W_CHAIN;21029// Conservatively set memVT to the entire set of vectors loaded.21030auto &DL = I.getDataLayout();21031uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;21032Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);21033Info.ptrVal = I.getArgOperand(0);21034Info.offset = 0;21035Value *AlignArg = I.getArgOperand(I.arg_size() - 1);21036Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();21037// volatile loads with NEON intrinsics not supported21038Info.flags = MachineMemOperand::MOLoad;21039return true;21040}21041case Intrinsic::arm_neon_vld1x2:21042case Intrinsic::arm_neon_vld1x3:21043case Intrinsic::arm_neon_vld1x4: {21044Info.opc = ISD::INTRINSIC_W_CHAIN;21045// Conservatively set memVT to the entire set of vectors loaded.21046auto &DL = I.getDataLayout();21047uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;21048Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);21049Info.ptrVal = I.getArgOperand(I.arg_size() - 1);21050Info.offset = 0;21051Info.align.reset();21052// volatile loads with NEON intrinsics not supported21053Info.flags = MachineMemOperand::MOLoad;21054return true;21055}21056case Intrinsic::arm_neon_vst1:21057case Intrinsic::arm_neon_vst2:21058case Intrinsic::arm_neon_vst3:21059case Intrinsic::arm_neon_vst4:21060case Intrinsic::arm_neon_vst2lane:21061case Intrinsic::arm_neon_vst3lane:21062case Intrinsic::arm_neon_vst4lane: {21063Info.opc = ISD::INTRINSIC_VOID;21064// Conservatively set memVT to the entire set of vectors stored.21065auto &DL = I.getDataLayout();21066unsigned NumElts = 0;21067for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {21068Type *ArgTy = I.getArgOperand(ArgI)->getType();21069if (!ArgTy->isVectorTy())21070break;21071NumElts += DL.getTypeSizeInBits(ArgTy) / 64;21072}21073Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);21074Info.ptrVal = I.getArgOperand(0);21075Info.offset = 0;21076Value *AlignArg = I.getArgOperand(I.arg_size() - 1);21077Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();21078// volatile stores with NEON intrinsics not 
supported21079Info.flags = MachineMemOperand::MOStore;21080return true;21081}21082case Intrinsic::arm_neon_vst1x2:21083case Intrinsic::arm_neon_vst1x3:21084case Intrinsic::arm_neon_vst1x4: {21085Info.opc = ISD::INTRINSIC_VOID;21086// Conservatively set memVT to the entire set of vectors stored.21087auto &DL = I.getDataLayout();21088unsigned NumElts = 0;21089for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {21090Type *ArgTy = I.getArgOperand(ArgI)->getType();21091if (!ArgTy->isVectorTy())21092break;21093NumElts += DL.getTypeSizeInBits(ArgTy) / 64;21094}21095Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);21096Info.ptrVal = I.getArgOperand(0);21097Info.offset = 0;21098Info.align.reset();21099// volatile stores with NEON intrinsics not supported21100Info.flags = MachineMemOperand::MOStore;21101return true;21102}21103case Intrinsic::arm_mve_vld2q:21104case Intrinsic::arm_mve_vld4q: {21105Info.opc = ISD::INTRINSIC_W_CHAIN;21106// Conservatively set memVT to the entire set of vectors loaded.21107Type *VecTy = cast<StructType>(I.getType())->getElementType(1);21108unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;21109Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);21110Info.ptrVal = I.getArgOperand(0);21111Info.offset = 0;21112Info.align = Align(VecTy->getScalarSizeInBits() / 8);21113// volatile loads with MVE intrinsics not supported21114Info.flags = MachineMemOperand::MOLoad;21115return true;21116}21117case Intrinsic::arm_mve_vst2q:21118case Intrinsic::arm_mve_vst4q: {21119Info.opc = ISD::INTRINSIC_VOID;21120// Conservatively set memVT to the entire set of vectors stored.21121Type *VecTy = I.getArgOperand(1)->getType();21122unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 
2 : 4;21123Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);21124Info.ptrVal = I.getArgOperand(0);21125Info.offset = 0;21126Info.align = Align(VecTy->getScalarSizeInBits() / 8);21127// volatile stores with MVE intrinsics not supported21128Info.flags = MachineMemOperand::MOStore;21129return true;21130}21131case Intrinsic::arm_mve_vldr_gather_base:21132case Intrinsic::arm_mve_vldr_gather_base_predicated: {21133Info.opc = ISD::INTRINSIC_W_CHAIN;21134Info.ptrVal = nullptr;21135Info.memVT = MVT::getVT(I.getType());21136Info.align = Align(1);21137Info.flags |= MachineMemOperand::MOLoad;21138return true;21139}21140case Intrinsic::arm_mve_vldr_gather_base_wb:21141case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {21142Info.opc = ISD::INTRINSIC_W_CHAIN;21143Info.ptrVal = nullptr;21144Info.memVT = MVT::getVT(I.getType()->getContainedType(0));21145Info.align = Align(1);21146Info.flags |= MachineMemOperand::MOLoad;21147return true;21148}21149case Intrinsic::arm_mve_vldr_gather_offset:21150case Intrinsic::arm_mve_vldr_gather_offset_predicated: {21151Info.opc = ISD::INTRINSIC_W_CHAIN;21152Info.ptrVal = nullptr;21153MVT DataVT = MVT::getVT(I.getType());21154unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();21155Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),21156DataVT.getVectorNumElements());21157Info.align = Align(1);21158Info.flags |= MachineMemOperand::MOLoad;21159return true;21160}21161case Intrinsic::arm_mve_vstr_scatter_base:21162case Intrinsic::arm_mve_vstr_scatter_base_predicated: {21163Info.opc = ISD::INTRINSIC_VOID;21164Info.ptrVal = nullptr;21165Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());21166Info.align = Align(1);21167Info.flags |= MachineMemOperand::MOStore;21168return true;21169}21170case Intrinsic::arm_mve_vstr_scatter_base_wb:21171case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {21172Info.opc = ISD::INTRINSIC_W_CHAIN;21173Info.ptrVal = nullptr;21174Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());21175Info.align = Align(1);21176Info.flags |= MachineMemOperand::MOStore;21177return true;21178}21179case Intrinsic::arm_mve_vstr_scatter_offset:21180case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {21181Info.opc = ISD::INTRINSIC_VOID;21182Info.ptrVal = nullptr;21183MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());21184unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();21185Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),21186DataVT.getVectorNumElements());21187Info.align = Align(1);21188Info.flags |= MachineMemOperand::MOStore;21189return true;21190}21191case Intrinsic::arm_ldaex:21192case Intrinsic::arm_ldrex: {21193auto &DL = I.getDataLayout();21194Type *ValTy = I.getParamElementType(0);21195Info.opc = ISD::INTRINSIC_W_CHAIN;21196Info.memVT = MVT::getVT(ValTy);21197Info.ptrVal = I.getArgOperand(0);21198Info.offset = 0;21199Info.align = DL.getABITypeAlign(ValTy);21200Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;21201return true;21202}21203case Intrinsic::arm_stlex:21204case Intrinsic::arm_strex: {21205auto &DL = I.getDataLayout();21206Type *ValTy = I.getParamElementType(1);21207Info.opc = ISD::INTRINSIC_W_CHAIN;21208Info.memVT = MVT::getVT(ValTy);21209Info.ptrVal = I.getArgOperand(1);21210Info.offset = 0;21211Info.align = DL.getABITypeAlign(ValTy);21212Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;21213return true;21214}21215case Intrinsic::arm_stlexd:21216case Intrinsic::arm_strexd:21217Info.opc = 
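    // Added note (commentary, not from upstream): arm_strexd/arm_stlexd take
    // the value as two i32 halves followed by the pointer (lo, hi, ptr), so the
    // pointer is argument 2 and the access is a fixed 8-byte store.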
ISD::INTRINSIC_W_CHAIN;21218Info.memVT = MVT::i64;21219Info.ptrVal = I.getArgOperand(2);21220Info.offset = 0;21221Info.align = Align(8);21222Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;21223return true;2122421225case Intrinsic::arm_ldaexd:21226case Intrinsic::arm_ldrexd:21227Info.opc = ISD::INTRINSIC_W_CHAIN;21228Info.memVT = MVT::i64;21229Info.ptrVal = I.getArgOperand(0);21230Info.offset = 0;21231Info.align = Align(8);21232Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;21233return true;2123421235default:21236break;21237}2123821239return false;21240}2124121242/// Returns true if it is beneficial to convert a load of a constant21243/// to just the constant itself.21244bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,21245Type *Ty) const {21246assert(Ty->isIntegerTy());2124721248unsigned Bits = Ty->getPrimitiveSizeInBits();21249if (Bits == 0 || Bits > 32)21250return false;21251return true;21252}2125321254bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,21255unsigned Index) const {21256if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))21257return false;2125821259return (Index == 0 || Index == ResVT.getVectorNumElements());21260}2126121262Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,21263ARM_MB::MemBOpt Domain) const {21264Module *M = Builder.GetInsertBlock()->getParent()->getParent();2126521266// First, if the target has no DMB, see what fallback we can use.21267if (!Subtarget->hasDataBarrier()) {21268// Some ARMv6 cpus can support data barriers with an mcr instruction.21269// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get21270// here.21271if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {21272Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);21273Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),21274Builder.getInt32(0), Builder.getInt32(7),21275Builder.getInt32(10), Builder.getInt32(5)};21276return Builder.CreateCall(MCR, args);21277} else {21278// Instead of using barriers, atomic accesses on these subtargets use21279// libcalls.21280llvm_unreachable("makeDMB on a target so old that it has no barriers");21281}21282} else {21283Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);21284// Only a full system barrier exists in the M-class architectures.21285Domain = Subtarget->isMClass() ? 
ARM_MB::SY : Domain;
    Constant *CDomain = Builder.getInt32(Domain);
    return Builder.CreateCall(DMB, CDomain);
  }
}

// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/non-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Acquire:
    return nullptr; // Nothing to do
  case AtomicOrdering::SequentiallyConsistent:
    if (!Inst->hasAtomicStore())
      return nullptr; // Nothing to do
    [[fallthrough]];
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
    if (Subtarget->preferISHSTBarriers())
      return makeDMB(Builder, ARM_MB::ISHST);
    // FIXME: add a comment with a link to documentation justifying this.
    else
      return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  switch (Ord) {
  case AtomicOrdering::NotAtomic:
  case AtomicOrdering::Unordered:
    llvm_unreachable("Invalid fence: unordered/not-atomic");
  case AtomicOrdering::Monotonic:
  case AtomicOrdering::Release:
    return nullptr; // Nothing to do
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    return makeDMB(Builder, ARM_MB::ISH);
  }
  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
TargetLoweringBase::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  bool has64BitAtomicStore;
  if (Subtarget->isMClass())
    has64BitAtomicStore = false;
  else if (Subtarget->isThumb())
    has64BitAtomicStore = Subtarget->hasV7Ops();
  else
    has64BitAtomicStore = Subtarget->hasV6Ops();

  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
                                           : AtomicExpansionKind::None;
}

// Loads and stores less than 64-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
// anything for those.
// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
// guarantee, see DDI0406C ARM architecture reference manual,
// sections A8.8.72-74 LDRD)
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  bool has64BitAtomicLoad;
  if (Subtarget->isMClass())
    has64BitAtomicLoad = false;
  else if (Subtarget->isThumb())
    has64BitAtomicLoad = Subtarget->hasV7Ops();
  else
    has64BitAtomicLoad = Subtarget->hasV6Ops();

  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return (Size == 64 && has64BitAtomicLoad) ?
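         // Added note (commentary, not from upstream): LLOnly expands the
         // 64-bit atomic load to a bare ldrexd/ldaexd with no matching
         // store-exclusive loop, since the paired load-exclusive already gives
         // a single-copy-atomic 64-bit read on its own.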
                                              AtomicExpansionKind::LLOnly
                                            : AtomicExpansionKind::None;
}

// For the real atomic operations, we have ldrex/strex up to 32 bits,
// and up to 64 bits on the non-M profiles
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  bool hasAtomicRMW;
  if (Subtarget->isMClass())
    hasAtomicRMW = Subtarget->hasV8MBaselineOps();
  else if (Subtarget->isThumb())
    hasAtomicRMW = Subtarget->hasV7Ops();
  else
    hasAtomicRMW = Subtarget->hasV6Ops();
  if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
    // At -O0, fast-regalloc cannot cope with the live vregs necessary to
    // implement atomicrmw without spilling. If the target address is also on
    // the stack and close enough to the spill slot, this can lead to a
    // situation where the monitor always gets cleared and the atomic operation
    // can never succeed. So at -O0 lower this operation to a CAS loop.
    if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
      return AtomicExpansionKind::CmpXChg;
    return AtomicExpansionKind::LLSC;
  }
  return AtomicExpansionKind::None;
}

// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
// bits, and up to 64 bits on the non-M profiles.
TargetLowering::AtomicExpansionKind
ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
  bool HasAtomicCmpXchg;
  if (Subtarget->isMClass())
    HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
  else if (Subtarget->isThumb())
    HasAtomicCmpXchg = Subtarget->hasV7Ops();
  else
    HasAtomicCmpXchg = Subtarget->hasV6Ops();
  if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
      HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ?
                                                          32U : 64U))
    return AtomicExpansionKind::LLSC;
  return AtomicExpansionKind::None;
}

bool ARMTargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return InsertFencesForAtomic;
}

bool ARMTargetLowering::useLoadStackGuardNode() const {
  // ROPI/RWPI are not supported currently.
  return !Subtarget->isROPI() && !Subtarget->isRWPI();
}

void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return TargetLowering::insertSSPDeclarations(M);

  // MSVC CRT has a global variable holding security cookie.
  M.getOrInsertGlobal("__security_cookie",
                      PointerType::getUnqual(M.getContext()));

  // MSVC CRT has a function to validate security cookie.
  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
      "__security_check_cookie", Type::getVoidTy(M.getContext()),
      PointerType::getUnqual(M.getContext()));
  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
    F->addParamAttr(0, Attribute::AttrKind::InReg);
}

Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                  unsigned &Cost) const {
  // If we do not have NEON, vector types are not natively supported.
  if (!Subtarget->hasNEON())
    return false;

  // Floating point values and vector values map to the same register file.
  // Therefore, although we could do a store extract of a vector type, this is
  // better to leave at float as we have more freedom in the addressing mode for
  // those.
  if (VectorTy->isFPOrFPVectorTy())
    return false;

  // If the index is unknown at compile time, this is very expensive to lower
  // and it is not possible to combine the store with the extract.
  if (!isa<ConstantInt>(Idx))
    return false;

  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
  // We can do a store + vector extract on any vector that fits perfectly in a D
  // or Q register.
  if (BitWidth == 64 || BitWidth == 128) {
    Cost = 0;
    return true;
  }
  return false;
}

bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return Subtarget->hasV6T2Ops();
}

bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  if (!Subtarget->hasV7Ops())
    return false;

  // Sink the `and` instruction only if the mask would fit into a modified
  // immediate operand.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask || Mask->getValue().getBitWidth() > 32u)
    return false;
  auto MaskVal =
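  // Added example (commentary only, not from upstream): on Thumb2 a mask such
  // as 0x00ff00ff is a valid modified immediate, so the and+cmp-with-zero pair
  // can fold into a single flag-setting "ands"; a value like 0x12345678 is not
  // encodable and the 'and' is not worth sinking.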
unsigned(Mask->getValue().getZExtValue());21515return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)21516: ARM_AM::getSOImmVal(MaskVal)) != -1;21517}2151821519TargetLowering::ShiftLegalizationStrategy21520ARMTargetLowering::preferredShiftLegalizationStrategy(21521SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {21522if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())21523return ShiftLegalizationStrategy::LowerToLibcall;21524return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,21525ExpansionFactor);21526}2152721528Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,21529Value *Addr,21530AtomicOrdering Ord) const {21531Module *M = Builder.GetInsertBlock()->getParent()->getParent();21532bool IsAcquire = isAcquireOrStronger(Ord);2153321534// Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd21535// intrinsic must return {i32, i32} and we have to recombine them into a21536// single i64 here.21537if (ValueTy->getPrimitiveSizeInBits() == 64) {21538Intrinsic::ID Int =21539IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;21540Function *Ldrex = Intrinsic::getDeclaration(M, Int);2154121542Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");2154321544Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");21545Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");21546if (!Subtarget->isLittle())21547std::swap (Lo, Hi);21548Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");21549Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");21550return Builder.CreateOr(21551Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");21552}2155321554Type *Tys[] = { Addr->getType() };21555Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;21556Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);21557CallInst *CI = Builder.CreateCall(Ldrex, Addr);2155821559CI->addParamAttr(215600, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));21561return Builder.CreateTruncOrBitCast(CI, ValueTy);21562}2156321564void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(21565IRBuilderBase &Builder) const {21566if (!Subtarget->hasV7Ops())21567return;21568Module *M = Builder.GetInsertBlock()->getParent()->getParent();21569Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));21570}2157121572Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,21573Value *Val, Value *Addr,21574AtomicOrdering Ord) const {21575Module *M = Builder.GetInsertBlock()->getParent()->getParent();21576bool IsRelease = isReleaseOrStronger(Ord);2157721578// Since the intrinsics must have legal type, the i64 intrinsics take two21579// parameters: "i32, i32". We must marshal Val into the appropriate form21580// before the call.21581if (Val->getType()->getPrimitiveSizeInBits() == 64) {21582Intrinsic::ID Int =21583IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;21584Function *Strex = Intrinsic::getDeclaration(M, Int);21585Type *Int32Ty = Type::getInt32Ty(M->getContext());2158621587Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");21588Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");21589if (!Subtarget->isLittle())21590std::swap(Lo, Hi);21591return Builder.CreateCall(Strex, {Lo, Hi, Addr});21592}2159321594Intrinsic::ID Int = IsRelease ? 
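  // Added note (commentary, not from upstream): release (or stronger)
  // orderings use the store-release exclusive form (stlex/stlexd); weaker
  // orderings can use a plain strex/strexd.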
Intrinsic::arm_stlex : Intrinsic::arm_strex;21595Type *Tys[] = { Addr->getType() };21596Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);2159721598CallInst *CI = Builder.CreateCall(21599Strex, {Builder.CreateZExtOrBitCast(21600Val, Strex->getFunctionType()->getParamType(0)),21601Addr});21602CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,21603Val->getType()));21604return CI;21605}216062160721608bool ARMTargetLowering::alignLoopsWithOptSize() const {21609return Subtarget->isMClass();21610}2161121612/// A helper function for determining the number of interleaved accesses we21613/// will generate when lowering accesses of the given type.21614unsigned21615ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,21616const DataLayout &DL) const {21617return (DL.getTypeSizeInBits(VecTy) + 127) / 128;21618}2161921620bool ARMTargetLowering::isLegalInterleavedAccessType(21621unsigned Factor, FixedVectorType *VecTy, Align Alignment,21622const DataLayout &DL) const {2162321624unsigned VecSize = DL.getTypeSizeInBits(VecTy);21625unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());2162621627if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())21628return false;2162921630// Ensure the vector doesn't have f16 elements. Even though we could do an21631// i16 vldN, we can't hold the f16 vectors and will end up converting via21632// f32.21633if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())21634return false;21635if (Subtarget->hasMVEIntegerOps() && Factor == 3)21636return false;2163721638// Ensure the number of vector elements is greater than 1.21639if (VecTy->getNumElements() < 2)21640return false;2164121642// Ensure the element type is legal.21643if (ElSize != 8 && ElSize != 16 && ElSize != 32)21644return false;21645// And the alignment if high enough under MVE.21646if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)21647return false;2164821649// Ensure the total vector size is 64 or a multiple of 128. Types larger than21650// 128 will be split into multiple interleaved accesses.21651if (Subtarget->hasNEON() && VecSize == 64)21652return true;21653return VecSize % 128 == 0;21654}2165521656unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {21657if (Subtarget->hasNEON())21658return 4;21659if (Subtarget->hasMVEIntegerOps())21660return MVEMaxSupportedInterleaveFactor;21661return TargetLoweringBase::getMaxSupportedInterleaveFactor();21662}2166321664/// Lower an interleaved load into a vldN intrinsic.21665///21666/// E.g. 
unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
  if (Subtarget->hasNEON())
    return 4;
  if (Subtarget->hasMVEIntegerOps())
    return MVEMaxSupportedInterleaveFactor;
  return TargetLoweringBase::getMaxSupportedInterleaveFactor();
}

/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
  Type *EltTy = VecTy->getElementType();

  const DataLayout &DL = LI->getDataLayout();
  Align Alignment = LI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  if (EltTy->isPointerTy())
    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // If we're going to generate more than one load, reset the sub-vector type
    // to something legal.
    VecTy = FixedVectorType::get(VecTy->getElementType(),
                                 VecTy->getNumElements() / NumLoads);
  }

  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

  auto createLoadIntrinsic = [&](Value *BaseAddr) {
    if (Subtarget->hasNEON()) {
      Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, PtrTy};
      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                                Intrinsic::arm_neon_vld3,
                                                Intrinsic::arm_neon_vld4};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(BaseAddr);
      Ops.push_back(Builder.getInt32(LI->getAlign().value()));

      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID LoadInts =
          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
      Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
      Type *Tys[] = {VecTy, PtrTy};
      Function *VldnFunc =
          Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);

      SmallVector<Value *, 2> Ops;
      Ops.push_back(BaseAddr);
      return Builder.CreateCall(VldnFunc, Ops, "vldN");
    }
  };
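  // Note that the NEON path above passes the load's alignment as an explicit
  // i32 operand to the llvm.arm.neon.vldN intrinsic, whereas the MVE
  // vld2q/vld4q path passes only the base pointer.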
  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
                                            VecTy->getNumElements() * Factor);

    CallInst *VldN = createLoadIntrinsic(BaseAddr);

    // Replace uses of each shufflevector with the corresponding vector loaded
    // by ldN.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SV = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(VldN, Index);

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec,
            FixedVectorType::get(SV->getType()->getElementType(), VecTy));

      SubVecs[SV].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
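// Illustrative example (not from the original comments) of the splitting
// performed by lowerInterleavedLoad above: a Factor = 2 load whose
// shufflevector results are <8 x i32> (256 bits) gets NumLoads = 2. Two vld2
// calls on <4 x i32> sub-vectors are emitted, the second at an offset of
// 4 * 2 = 8 elements, and the two halves extracted for each shufflevector are
// concatenated back into the <8 x i32> values the shuffles originally
// produced.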
/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
///
/// Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// vst3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getDataLayout();
  Align Alignment = SI->getAlign();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);

    // Convert to the corresponding integer vector.
    auto *IntVecTy =
        FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // If we're going to generate more than one store, reset the lane length
    // and sub-vector type to something legal.
    LaneLen /= NumStores;
    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
  }

  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

  auto Mask = SVI->getShuffleMask();
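  // createStoreIntrinsic (below) emits the actual store: on the NEON path a
  // single llvm.arm.neon.vstN call carrying an explicit alignment operand; on
  // the MVE path one vst2q/vst4q call per interleave stage, with the stage
  // index appended as the final i32 operand.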
  auto createStoreIntrinsic = [&](Value *BaseAddr,
                                  SmallVectorImpl<Value *> &Shuffles) {
    if (Subtarget->hasNEON()) {
      static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                                 Intrinsic::arm_neon_vst3,
                                                 Intrinsic::arm_neon_vst4};
      Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {PtrTy, SubVecTy};

      Function *VstNFunc = Intrinsic::getDeclaration(
          SI->getModule(), StoreInts[Factor - 2], Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(BaseAddr);
      append_range(Ops, Shuffles);
      Ops.push_back(Builder.getInt32(SI->getAlign().value()));
      Builder.CreateCall(VstNFunc, Ops);
    } else {
      assert((Factor == 2 || Factor == 4) &&
             "expected interleave factor of 2 or 4 for MVE");
      Intrinsic::ID StoreInts =
          Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
      Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
      Type *Tys[] = {PtrTy, SubVecTy};
      Function *VstNFunc =
          Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);

      SmallVector<Value *, 6> Ops;
      Ops.push_back(BaseAddr);
      append_range(Ops, Shuffles);
      for (unsigned F = 0; F < Factor; F++) {
        Ops.push_back(Builder.getInt32(F));
        Builder.CreateCall(VstNFunc, Ops);
        Ops.pop_back();
      }
    }
  };

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    SmallVector<Value *, 4> Shuffles;

    // Split the shufflevector operands into sub vectors for the new vstN call.
    for (unsigned i = 0; i < Factor; i++) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
          if (Mask[IdxJ * Factor + IdxI] >= 0) {
            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
            break;
          }
        }
        // Note: If all elements in a chunk are undefs, StartMask=0!
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0.
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask.
        Shuffles.push_back(Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
      }
    }

    createStoreIntrinsic(BaseAddr, Shuffles);
  }
  return true;
}
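// The helpers below implement AAPCS-VFP homogeneous aggregate (HA)
// classification: an HA is a struct or array of one to four members that all
// share the same base type (float, double, or a 64-/128-bit vector). For
// example, struct { float x, y, z; } is an HA of three floats, whereas
// struct { float f; double d; } is not, because the base types differ.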
enum HABaseType {
  HA_UNKNOWN = 0,
  HA_FLOAT,
  HA_DOUBLE,
  HA_VECT64,
  HA_VECT128
};

static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                   uint64_t &Members) {
  if (auto *ST = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
      uint64_t SubMembers = 0;
      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
        return false;
      Members += SubMembers;
    }
  } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
    uint64_t SubMembers = 0;
    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
      return false;
    Members += SubMembers * AT->getNumElements();
  } else if (Ty->isFloatTy()) {
    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
      return false;
    Members = 1;
    Base = HA_FLOAT;
  } else if (Ty->isDoubleTy()) {
    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
      return false;
    Members = 1;
    Base = HA_DOUBLE;
  } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
    Members = 1;
    switch (Base) {
    case HA_FLOAT:
    case HA_DOUBLE:
      return false;
    case HA_VECT64:
      return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
    case HA_VECT128:
      return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
    case HA_UNKNOWN:
      switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
      case 64:
        Base = HA_VECT64;
        return true;
      case 128:
        Base = HA_VECT128;
        return true;
      default:
        return false;
      }
    }
  }

  return (Members > 0 && Members <= 4);
}

/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(
    Type *ArgTy, const DataLayout &DL) const {
  const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
  if (!ArgTy->isVectorTy())
    return ABITypeAlign;

  // Avoid over-aligning vector parameters. It would require realigning the
  // stack and waste space for no real benefit.
  return std::min(ABITypeAlign, DL.getStackAlignment());
}

/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (getEffectiveCallingConv(CallConv, isVarArg) !=
      CallingConv::ARM_AAPCS_VFP)
    return false;

  HABaseType Base = HA_UNKNOWN;
  uint64_t Members = 0;
  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
  return IsHA || IsIntArray;
}

Register ARMTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}

Register ARMTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Platforms which do not use SjLj EH may return values in these registers
  // via the personality function.
  return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}
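// The split-CSR hooks below support functions whose callee-saved registers
// are preserved via register copies rather than stack spills (notably
// CXX_FAST_TLS): each such CSR is copied into a fresh virtual register in the
// entry block and copied back immediately before the terminator of every exit
// block.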
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in ARMFunctionInfo.
  ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void ARMTargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (ARM::GPRRegClass.contains(*I))
      RC = &ARM::GPRRegClass;
    else if (ARM::DPRRegClass.contains(*I))
      RC = &ARM::DPRRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
  MF.getFrameInfo().computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}

bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasMVEIntegerOps();
}

bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!VTy)
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getNumElements();

  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
    return false;

  // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
  if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
    return Subtarget->hasMVEFloatOps();

  if (Operation != ComplexDeinterleavingOperation::CAdd)
    return false;

  return Subtarget->hasMVEIntegerOps() &&
         (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
          ScalarTy->isIntegerTy(32));
}

Value *ARMTargetLowering::createComplexDeinterleavingIR(
    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {

  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());

  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();

  assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");

  if (TyWidth > 128) {
    int Stride = Ty->getNumElements() / 2;
    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
    auto SplitSeqVec = llvm::to_vector(SplitSeq);
    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);

    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;

    if (Accumulator) {
      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
    }

    auto *LowerSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
  }
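  // Past this point the vector type is exactly 128 bits wide, so the operation
  // maps onto a single MVE intrinsic: vcmlaq/vcmulq for partial complex
  // multiplies and vcaddq for complex additions, with the rotation encoded as
  // an immediate operand.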
  auto *IntTy = Type::getInt32Ty(B.getContext());

  ConstantInt *ConstRotation = nullptr;
  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    ConstRotation = ConstantInt::get(IntTy, (int)Rotation);

    if (Accumulator)
      return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
                               {ConstRotation, Accumulator, InputB, InputA});
    return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
                             {ConstRotation, InputB, InputA});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
    // 1 means the value is not halved.
    auto *ConstHalving = ConstantInt::get(IntTy, 1);

    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      ConstRotation = ConstantInt::get(IntTy, 0);
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      ConstRotation = ConstantInt::get(IntTy, 1);

    if (!ConstRotation)
      return nullptr; // Invalid rotation for arm_mve_vcaddq

    return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
                             {ConstHalving, ConstRotation, InputA, InputB});
  }

  return nullptr;
}