Path: blob/main/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
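
// Illustration (editor's sketch): getEquivalentMemType picks a same-sized
// integer form for awkward vector types, e.g.
//   getEquivalentMemType(Ctx, MVT::v4i8);  // 32 bits -> i32
//   getEquivalentMemType(Ctx, MVT::v4i16); // 64 bits -> v2i32
// numBitsUnsigned and numBitsSigned feed the 24-bit arithmetic combines later
// in this file, which can select a multiply as MUL_U24/MUL_I24 when both
// operands are known to fit in 24 bits.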

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
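
  // Note (editor's illustration): in the load-extend tables below, a Legal
  // entry such as (SEXTLOAD, i32, i8) corresponds to a native sign-extending
  // byte load (e.g. buffer_load_sbyte), while Expand splits the operation into
  // a plain load followed by a separate extend.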

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
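
  // Editor's note: FREM is custom lowered (see LowerFREM) roughly as
  //   frem(x, y) -> fma(-trunc(x / y), y, x)   // i.e. x - y * trunc(x / y)
  // since there is no hardware floating-point remainder instruction.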

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);
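
  // Editor's note: the odd-sized vector types below have no native
  // operations, so marking everything Expand scalarizes them; e.g. an add of
  // v3i32 becomes three 32-bit adds during legalization.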

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR, ISD::SHL,
                        ISD::SRA, ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR, ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For
  // now, we don't have a way of knowing during instruction selection if a
  // condition will be uniform and we always use vector compares. Assume we are
  // using vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
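
// Illustration (editor's sketch): for the opcodes above, an fneg of the
// result can be pushed into the operands, where it becomes a free 'neg'
// source modifier on the VALU instruction, e.g.
//   fneg (fadd x, y) -> fadd (fneg x), (fneg y)
// (valid only when signed zeros may be ignored; see mayIgnoreSignedZero).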

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// Returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type for ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and folding a modifier would force each of them into a VOP3
  // encoding, there will be a code size increase. Try to avoid increasing code
  // size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32 bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
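
// Editor's note: FP immediates are reported legal because the hardware can
// encode common values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0, ...) as inline
// constants in the instruction word, so there is little to gain from
// constant-pool loads or from shrinking f64 constants to f32.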

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {
  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}
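
// Editor's note: getNegatedExpression below also pushes an fneg into
// AMDGPUISD::RCP, e.g. fneg (rcp x) -> rcp (fneg x), where the inner fneg
// then becomes a free source modifier.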

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {
  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra 'mov 0' used to extend to 64 bits is
  // free. As used, this will enable reducing 64-bit operations to 32-bit ones,
  // which is always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}
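
// Illustration (editor's sketch): a free zext i32 -> i64 is just
//   v_mov_b32 v1, 0   ; high half
// with the low half left in place, since 64-bit values live in pairs of
// 32-bit registers.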

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}
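
// Editor's note: the i32 right-shift check above protects bitfield-extract
// selection; e.g. (srl (shl (or x, y), 16), 24) can select roughly as a
// single v_bfe_u32/s_bfe_u32 of the or result, which commuting the shl
// through the or would break.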

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
The "PartOffset" is completely useless1175// to us as computed in Ins.1176//1177// We also need to figure out what type legalization is trying to do to get1178// the correct memory offsets.11791180SmallVector<EVT, 16> ValueVTs;1181SmallVector<uint64_t, 16> Offsets;1182ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);11831184for (unsigned Value = 0, NumValues = ValueVTs.size();1185Value != NumValues; ++Value) {1186uint64_t BasePartOffset = Offsets[Value];11871188EVT ArgVT = ValueVTs[Value];1189EVT MemVT = ArgVT;1190MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);1191unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);11921193if (NumRegs == 1) {1194// This argument is not split, so the IR type is the memory type.1195if (ArgVT.isExtended()) {1196// We have an extended type, like i24, so we should just use the1197// register type.1198MemVT = RegisterVT;1199} else {1200MemVT = ArgVT;1201}1202} else if (ArgVT.isVector() && RegisterVT.isVector() &&1203ArgVT.getScalarType() == RegisterVT.getScalarType()) {1204assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());1205// We have a vector value which has been split into a vector with1206// the same scalar type, but fewer elements. This should handle1207// all the floating-point vector types.1208MemVT = RegisterVT;1209} else if (ArgVT.isVector() &&1210ArgVT.getVectorNumElements() == NumRegs) {1211// This arg has been split so that each element is stored in a separate1212// register.1213MemVT = ArgVT.getScalarType();1214} else if (ArgVT.isExtended()) {1215// We have an extended type, like i65.1216MemVT = RegisterVT;1217} else {1218unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;1219assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);1220if (RegisterVT.isInteger()) {1221MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);1222} else if (RegisterVT.isVector()) {1223assert(!RegisterVT.getScalarType().isFloatingPoint());1224unsigned NumElements = RegisterVT.getVectorNumElements();1225assert(MemoryBits % NumElements == 0);1226// This vector type has been split into another vector type with1227// a different elements size.1228EVT ScalarVT = EVT::getIntegerVT(State.getContext(),1229MemoryBits / NumElements);1230MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);1231} else {1232llvm_unreachable("cannot deduce memory type.");1233}1234}12351236// Convert one element vectors to scalar.1237if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)1238MemVT = MemVT.getScalarType();12391240// Round up vec3/vec5 argument.1241if (MemVT.isVector() && !MemVT.isPow2VectorType()) {1242assert(MemVT.getVectorNumElements() == 3 ||1243MemVT.getVectorNumElements() == 5 ||1244(MemVT.getVectorNumElements() >= 9 &&1245MemVT.getVectorNumElements() <= 12));1246MemVT = MemVT.getPow2VectorType(State.getContext());1247} else if (!MemVT.isSimple() && !MemVT.isVector()) {1248MemVT = MemVT.getRoundIntegerType(State.getContext());1249}12501251unsigned PartOffset = 0;1252for (unsigned i = 0; i != NumRegs; ++i) {1253State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,1254BasePartOffset + PartOffset,1255MemVT.getSimpleVT(),1256CCValAssign::Full));1257PartOffset += MemVT.getStoreSize();1258}1259}1260}1261}12621263SDValue AMDGPUTargetLowering::LowerReturn(1264SDValue Chain, CallingConv::ID CallConv,1265bool isVarArg,1266const SmallVectorImpl<ISD::OutputArg> &Outs,1267const SmallVectorImpl<SDValue> &OutVals,1268const SDLoc &DL, SelectionDAG &DAG) const {1269// FIXME: Fails for r600 

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered object.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
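
// Editor's note: lowerUnhandledCall below is the shared failure path for call
// lowering. It emits a DiagnosticInfoUnsupported naming the callee, fills the
// expected results with UNDEF so selection can proceed, and returns the entry
// chain.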

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getUNDEF(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}
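
// Editor's note: for LDS (local address space) globals, LowerGlobalAddress
// below folds the address away entirely: the variable is assigned a static
// offset in the kernel's LDS block via allocateLDSGlobal, and the "address"
// becomes a plain 32-bit constant.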

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {
  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}
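
// Editor's illustration: LowerCONCAT_VECTORS below avoids per-element copies
// for small-element vectors by reassembling whole 32-bit registers, roughly
//   concat (v2f16 a), (v2f16 b)
//     -> bitcast (build_vector (bitcast a to i32), (bitcast b to i32))
// yielding a v4f16 from a v2i32 build_vector.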

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}
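// Illustrative note (added commentary, not from the original source): for
// 16-bit element vectors the lowering above operates on whole 32-bit
// registers. For example, extracting v2f16 at element 2 of a v8f16 source
// becomes extracting lane 1 of the v4i32 bitcast, then bitcasting that lane
// back to v2f16, avoiding any sub-register shuffles.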

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two
// vector. The second part is whatever is left over, and is a scalar if it
// would otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}
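// Worked example (added for illustration, not in the original source):
// getSplitDestVTs rounds the low half up to a power of two, so
//   v7i32 -> (v4i32, v3i32)
//   v3i32 -> (v2i32, i32)    (a leftover 1-vector becomes a scalar)
//   v8i32 -> (v4i32, v4i32)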

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}

// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  SDValue jq = DAG.getConstant(1, DL, IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
                     DAG.getConstant(BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib = (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

  MachineFunction &MF = DAG.getMachineFunction();

  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
    Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
    Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
    Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
  }

  return DAG.getMergeValues({ Div, Rem }, DL);
}
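// Note on the correction step above (added commentary, not original): the
// truncated estimate fq = trunc(fa * rcp(fb)) can be one unit below the true
// quotient because rcp is only approximately correct. The residual
// fr = |fa - fq * fb| is compared against |fb|; if a whole divisor still
// fits, one more unit (jq, which also carries the quotient's sign in the
// signed case) is added to iq.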

void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                          SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS_Lo, LHS_Hi;
  SDValue LHS = Op.getOperand(0);
  std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);

  SDValue RHS_Lo, RHS_Hi;
  SDValue RHS = Op.getOperand(1);
  std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);

  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    // The algorithm here is based on ideas from "Software Integer Division",
    // Tom Rodeheffer, August 2008.

    MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    // Compute denominator reciprocal.
    unsigned FMAD =
        !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
            ? (unsigned)ISD::FMAD
            : (unsigned)AMDGPUISD::FMAD_FTZ;

    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
        DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
        Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
        DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
        DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
        DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
        Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64 = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First round of UNR (Unsigned integer Newton-Raphson).
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo, Mulhi1_Hi;
    std::tie(Mulhi1_Lo, Mulhi1_Hi) =
        DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
    SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    SDValue Add1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second round of UNR.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo, Mulhi2_Hi;
    std::tie(Mulhi2_Lo, Mulhi2_Hi) =
        DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
    SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
                                  Mulhi2_Hi, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));

    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    SDValue Mul3_Lo, Mul3_Hi;
    std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
    SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi,
                                   ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero,
                                   ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}
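// The loop above is classic restoring long division, one quotient bit per
// iteration. A scalar sketch of the same idea (illustrative only, not part
// of the original file):
//   uint64_t rem = hi_rem; // speculative remainder of the high word
//   uint32_t div = 0;
//   for (int bit = 31; bit >= 0; --bit) {
//     rem = (rem << 1) | ((lo >> bit) & 1);
//     if (rem >= rhs) { rem -= rhs; div |= 1u << bit; }
//   }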

SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Results, DL);
  }

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, false))
      return Res;
  }

  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);

  // One round of UNR.
  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));

  // Quotient/remainder estimate.
  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
  SDValue R =
      DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));

  // First quotient/remainder refinement.
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);

  return DAG.getMergeValues({Q, R}, DL);
}
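// Why two refinement rounds are enough (added note, not from the original):
// after the single Newton-Raphson step the fixed-point reciprocal Z slightly
// underestimates 2^32/y, so the initial quotient Q = mulhu(x, Z) can be low,
// but only by a small bounded amount (at most 2 here). Each SETUGE check
// adds one back and subtracts y from the remainder, so the second round
// always lands on the exact quotient/remainder pair.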

SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue NegOne = DAG.getConstant(-1, DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, true))
      return Res;
  }

  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(LHS) > 32 &&
      DAG.ComputeNumSignBits(RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    // HiLo split
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL,
                                 DAG.getVTList(HalfVT, HalfVT), LHS_Lo, RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
    };
    return DAG.getMergeValues(Res, DL);
  }

  SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
  SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);

  LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
  RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);

  SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
  SDValue Rem = Div.getValue(1);

  Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);

  Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Res, DL);
}

// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  auto Flags = Op->getFlags();
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);

  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
  // TODO: For f32 use FMAD instead if !hasFastFMA32?
  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
}
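// Worked instance of the frem expansion above (added for illustration):
//   frem(5.5, 2.0) = fma(-trunc(5.5 / 2.0), 2.0, 5.5)
//                  = fma(-2.0, 2.0, 5.5) = 1.5
// which keeps the sign of the dividend, as frem requires.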

SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
                                  SelectionDAG &DAG) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                Hi,
                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
                                DAG.getConstant(ExpBits, SL, MVT::i32));
  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
                            DAG.getConstant(1023, SL, MVT::i32));

  return Exp;
}

SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);

  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
  const SDValue FractMask
    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);

  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);

  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);

  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}

SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(Op.getValueType() == MVT::f64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);

  // TODO: Should this propagate fast-math-flags?

  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);

  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}
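// Note on the 0x1.0p+52 constant above (added commentary, not original):
// adding copysign(2^52, src) pushes all fraction bits below 1.0 out of the
// f64 significand, so the hardware's round-to-nearest-even performs the
// integer rounding; subtracting the same constant leaves roundeven(src).
// Inputs with |src| > 0x1.fffffffffffffp+51 are already integral and are
// returned unchanged by the final select.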

SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
                                              SelectionDAG &DAG) const {
  // FNEARBYINT and FRINT are the same, except in their handling of FP
  // exceptions. Those aren't really meaningful for us, and OpenCL only has
  // rint, so just treat them as equivalent.
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
                     Op.getOperand(0));
}

SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
  auto VT = Op.getValueType();
  auto Arg = Op.getOperand(0u);
  return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
}

// XXX - May require not supporting f32 denormals?

// Don't handle v2f16. The extra instructions to scalarize and repack around
// the compare and vselect end up producing worse code than scalarizing the
// whole operation.
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();

  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);

  // TODO: Should this propagate fast-math-flags?

  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);

  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  const SDValue One = DAG.getConstantFP(1.0, SL, VT);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
  SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);

  SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
  return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
}

SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
  const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

  EVT SetCCVT =
      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

  SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);

  SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(SDValue Src) {
  switch (Src.getOpcode()) {
  case ISD::FP_EXTEND:
    return Src.getOperand(0).getValueType() == MVT::f16;
  case ISD::FP16_TO_FP:
  case ISD::FFREXP:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Src.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_frexp_mant:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }

  llvm_unreachable("covered opcode switch");
}

bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  if (Flags.hasApproximateFuncs())
    return true;
  auto &Options = DAG.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
                                                  SDValue Src,
                                                  SDNodeFlags Flags) {
  return !valueIsKnownNeverF32Denorm(Src) &&
         DAG.getMachineFunction()
                 .getDenormalMode(APFloat::IEEEsingle())
                 .Input != DenormalMode::PreserveSign;
}

SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
                                                    SDValue Src,
                                                    SDNodeFlags Flags) const {
  SDLoc SL(Src);
  EVT VT = Src.getValueType();
  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  // Want to scale denormals up, but negatives and 0 work just as well on the
  // scaled path.
  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  return IsLtSmallestNormal;
}

SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
                                          SDNodeFlags Flags) const {
  SDLoc SL(Src);
  EVT VT = Src.getValueType();
  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
  SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);

  SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
  SDValue IsFinite = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
      Inf, ISD::SETOLT);
  return IsFinite;
}

/// If denormal handling is required return the scaled input to FLOG2, and the
/// check for denormal range. Otherwise, return null values.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
                                        SDValue Src, SDNodeFlags Flags) const {
  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return {};

  MVT VT = MVT::f32;
  const fltSemantics &Semantics = APFloat::IEEEsingle();
  SDValue SmallestNormal =
      DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);

  SDValue IsLtSmallestNormal = DAG.getSetCC(
      SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
      SmallestNormal, ISD::SETOLT);

  SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ScaleFactor =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);

  SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
  return {ScaledInput, IsLtSmallestNormal};
}

SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  auto [ScaledInput, IsLtSmallestNormal] =
      getScaledLogInput(DAG, SL, Src, Flags);
  if (!ScaledInput)
    return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);

  SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);

  SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue ResultOffset =
      DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
  return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
}
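// Concrete case of the scaling above (illustrative, not from the original):
// for the denormal input x = 0x1.0p-140, ScaledInput = 0x1.0p-108 is normal,
// v_log_f32 returns -108.0, and subtracting the 32.0 offset restores the
// exact result log2(x) = -140.0.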

static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
                      SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
  return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
}

SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);

  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  const auto &Options = getTargetMachine().Options;
  if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
      Options.ApproxFuncFPMath || Options.UnsafeFPMath) {

    if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
      // Log and multiply in f32 is good enough for f16.
      X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
    if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
      return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
                         DAG.getTargetConstant(0, DL, MVT::i32), Flags);
    }

    return Lowered;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);

    R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
    SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
    SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
    R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);

    SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
    SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
    SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
    SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
    SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);

    SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
    SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
    SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
    R = getMad(DAG, DL, VT, YH, CH, Mad1);
  }

  const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
                            (Flags.hasNoInfs() || Options.NoInfsFPMath);

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    SDValue IsFinite = getIsFinite(DAG, Y, Flags);
    R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
  }

  if (IsScaled) {
    SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
    R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
  }

  return R;
}

SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}

// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for
// a promoted f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
      SDValue ScaledResultOffset =
          DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);

      SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);

      SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
                                         ScaledResultOffset, Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);

      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
      return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);

  return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
                     Flags);
}

SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!Subtarget->has16BitInsts());
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
    SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue NeedsScaling =
      DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);

  SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);

  SDValue AddOffset =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);

  SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);

  SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
  SDValue One = DAG.getConstantFP(1.0, SL, VT);
  SDValue ResultScale =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);

  return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
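// Sanity check of the exp2 scaling above (added note, not original): for
// x = -140.0, NeedsScaling is true, so v_exp_f32 evaluates 2^(-140 + 64) =
// 2^-76 well inside the normal range, and the final multiply by 0x1.0p-64f
// produces the denormal result 2^-140 exactly.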

SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
                                              SelectionDAG &DAG,
                                              SDNodeFlags Flags) const {
  EVT VT = X.getValueType();
  const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);

  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
    // exp2(M_LOG2E_F * f);
    SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
    return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
                                      : (unsigned)ISD::FEXP2,
                       SL, VT, Mul, Flags);
  }

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);

  SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);

  SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);

  SDValue AdjustedX =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);

  SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);

  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);

  SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);

  return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
                     Flags);
}
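// Note on the magic constants above (added commentary): shifting x up by
// +64 before the multiply by log2(e) scales the final result by e^64, so
// the compensation factor is e^-64 ~= 0x1.969d48p-93f, and the threshold
// -0x1.5d58a0p+6f ~= ln(2^-126) marks where e^x would otherwise drop below
// the smallest normal f32.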

/// Emit an approx-funcs-appropriate lowering for exp10. inf/nan should still
/// be handled correctly.
SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
                                                SelectionDAG &DAG,
                                                SDNodeFlags Flags) const {
  const EVT VT = X.getValueType();
  const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;

  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
    SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);

    SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
    SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
    SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
    SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
    return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
  }

  // bool s = x < -0x1.2f7030p+5f;
  // x += s ? 0x1.0p+5f : 0.0f;
  // exp10 = exp2(x * 0x1.a92000p+1f) *
  //         exp2(x * 0x1.4f0978p-11f) *
  //         (s ? 0x1.9f623ep-107f : 1.0f);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);

  SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
  SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
  SDValue AdjustedX =
      DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);

  SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
  SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);

  SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
  SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
  SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
  SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);

  SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);

  SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);

  return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult,
                     MulExps, Flags);
}

SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue X = Op.getOperand(0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  if (VT.getScalarType() == MVT::f16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
      return lowerFEXPUnsafe(X, SL, DAG, Flags);

    if (VT.isVector())
      return SDValue();

    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))

    // Nothing in half is a denormal when promoted to f32.
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
    SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
                       DAG.getTargetConstant(0, SL, MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) {
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
    SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);

    PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
    SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
    SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
    PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
  } else {
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
    SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);

    SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
    SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
    SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
    SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
    SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);

    PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);

    SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
    PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
  }

  SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);

  SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
  SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
  SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);

  SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);

  SDValue UnderflowCheckConst =
      DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);

  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
  SDValue Underflow =
      DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);

  R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
  const auto &Options = getTargetMachine().Options;

  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
  if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
    SDValue OverflowCheckConst =
        DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
    SDValue Overflow =
        DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
    R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
  }

  return R;
}

static bool isCtlzOpc(unsigned Opc) {
  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
}

static bool isCttzOpc(unsigned Opc) {
  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
}

SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                               SelectionDAG &DAG) const {
  auto SL = SDLoc(Op);
  auto Opc = Op.getOpcode();
  auto Arg = Op.getOperand(0u);
  auto ResultVT = Op.getValueType();

  if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
    return {};

  assert(isCtlzOpc(Opc));
  assert(ResultVT == Arg.getValueType());

  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
  SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
  SDValue NewOp;

  if (Opc == ISD::CTLZ_ZERO_UNDEF) {
    NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
    NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
  } else {
    NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
    NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
  }

  return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}

SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)

    // The 64-bit scalar version produces a 32-bit result:
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
    if (!ZeroUndef) {
      const SDValue ConstVal = DAG.getConstant(
          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
    }
    return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
  }

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
  SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
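  // Worked example (illustrative): for src = 0x0000'0001'0000'0000, ctlz
  // should be 31. ffbh(hi = 1) = 31, while ffbh(lo = 0) = -1 and
  // uaddsat(-1, 32) saturates to UINT32_MAX, so the umin chain gives
  // umin(31, UINT32_MAX, 64) = 31.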

  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
  else
    OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
    NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
  }

  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method of converting a 64-bit integer to float roughly
  // consists of 2 steps: normalization and rounding. In fact, after
  // normalization, the conversion from a 64-bit integer to a float is
  // essentially the same as the one from a 32-bit integer. The only
  // difference is that it has more trailing bits to be rounded. To leverage
  // the native 32-bit conversion, a 64-bit integer can be preprocessed and
  // fit into a 32-bit integer, then converted into the correct float number.
  // The basic steps for the unsigned conversion are illustrated in the
  // following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // Convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count
  // its sign bits instead. If 'ffbh_i32' is not available, its absolute
  // value is converted instead, followed by negation based on its sign bit.
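  //
  // Worked example (illustrative, added commentary): u = 2^40 + 1 gives
  // hi = 0x100, lo = 1, so shamt = clz(hi) = 23. After u <<= 23 we have
  // hi = 0x80000000 and lo = 0x800000; lo != 0 sets the sticky bit, so
  // hi |= 1. uitofp(hi) performs the single rounding step, and scaling by
  // 2^(32 - 23) = 2^9 yields float(2^40 + 1) = 2^40 as expected.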

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
        DAG.getConstant(31, SL, MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                    OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    // Different from the unsigned conversion, the shift should be one bit
    // less to preserve the sign bit.
    ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
                        DAG.getConstant(1, SL, MVT::i32));
    ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
                         DAG.getConstant(63, SL, MVT::i64));
      SDValue Abs =
          DAG.getNode(ISD::XOR, SL, MVT::i64,
                      DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
      std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
  // Split it again.
  std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
                               DAG.getConstant(1, SL, MVT::i32), Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
  // Convert the normalized 32-bit integer into f32.
  unsigned Opc =
      (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);

  // Finally, we need to scale back the converted floating number as the
  // original 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
                      ShAmt);
  // On GCN, use LDEXP directly.
  if (Subtarget->isGCN())
    return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the
  // exponent part directly to emulate the multiplication of 2^ShAmt. That
  // 8-bit exponent is enough to avoid overflowing into the sign bit.
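  //
  // Illustrative note (added commentary): adding ShAmt << 23 to the f32 bit
  // pattern below bumps the biased exponent by ShAmt, i.e. multiplies the
  // value by 2^ShAmt, provided the result stays normal and finite.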
  SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
                            DAG.getConstant(23, SL, MVT::i32));
  SDValue IVal =
      DAG.getNode(ISD::ADD, SL, MVT::i32,
                  DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
  if (Signed) {
    // Set the sign bit.
    Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
                       DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
                       DAG.getConstant(31, SL, MVT::i32));
    IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
  }
  return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
}

SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(0);

  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(Src, DAG);

  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
                              SL, MVT::f64, Hi);

  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);

  SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
                              DAG.getConstant(32, SL, MVT::i32));
  // TODO: Should this propagate fast-math-flags?
  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}

SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  // TODO: Factor out code common with LowerSINT_TO_FP.
  EVT DestVT = Op.getValueType();
  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;
    SDLoc DL(Op);

    // Promote src to i32
    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
    return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
  }

  if (DestVT == MVT::bf16) {
    SDLoc SL(Op);
    SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, false);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, false);
}

SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT DestVT = Op.getValueType();

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT == MVT::i16) {
    if (DestVT == MVT::f16)
      return Op;

    SDLoc DL(Op);
    // Promote src to i32
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
    return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
  }

  if (DestVT == MVT::bf16) {
    SDLoc SL(Op);
    SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
  }

  if (SrcVT != MVT::i64)
    return Op;

  // TODO: Factor out code common with LowerUINT_TO_FP.

  if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
    SDLoc DL(Op);
    SDValue Src = Op.getOperand(0);

    SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
    SDValue FPRoundFlag =
        DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
    SDValue FPRound =
        DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);

    return FPRound;
  }

  if (DestVT == MVT::f32)
    return LowerINT_TO_FP32(Op, DAG, true);

  assert(DestVT == MVT::f64);
  return LowerINT_TO_FP64(Op, DAG, true);
}

SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of
  // 32-bit integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
                       DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
                       DAG.getConstant(31, SL, MVT::i32));
    Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
  }

  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
        SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
        SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
    K1 = DAG.getConstantFP(
        llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);

  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);

  SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
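  // Illustrative note (added commentary): the fma computes
  // lof = hif * (-2^32) + tf, i.e. the exact remainder tf - hif * 2^32,
  // which is non-negative because hif was rounded down with floor.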
  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                         : ISD::FP_TO_UINT,
                           SL, MVT::i32, FloorMul);
  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);

  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                               DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
                       DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
    // r := xor(r, sign) - sign;
    Result =
        DAG.getNode(ISD::SUB, SL, MVT::i64,
                    DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
  }

  return Result;
}

SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue N0 = Op.getOperand(0);

  // Convert to target node to get known bits
  if (N0.getValueType() == MVT::f32)
    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);

  if (getTargetMachine().Options.UnsafeFPMath) {
    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    return SDValue();
  }

  assert(N0.getSimpleValueType() == MVT::f64);

  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
  SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
                           DAG.getConstant(32, DL, MVT::i64));
  UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
  U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
  SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(20, DL, MVT::i64));
  E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
                  DAG.getConstant(ExpMask, DL, MVT::i32));
  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
                  DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));

  SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                          DAG.getConstant(8, DL, MVT::i32));
  M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
                  DAG.getConstant(0xffe, DL, MVT::i32));

  SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
                                  DAG.getConstant(0x1ff, DL, MVT::i32));
  MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);

  SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
  M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
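  // Illustrative note (added commentary): 0x7c00 is the all-ones f16
  // exponent field (Inf); OR-ing in the mantissa bit 0x0200 when M != 0
  // keeps an f64 NaN a NaN instead of collapsing it to Inf.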
  SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
                          DAG.getSelectCC(DL, M, Zero,
                                          DAG.getConstant(0x0200, DL, MVT::i32),
                                          Zero, ISD::SETNE),
                          DAG.getConstant(0x7c00, DL, MVT::i32));

  // N = M | (E << 12);
  SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
                          DAG.getNode(ISD::SHL, DL, MVT::i32, E,
                                      DAG.getConstant(12, DL, MVT::i32)));

  // B = clamp(1-E, 0, 13);
  SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, One, E);
  SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
  B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
                  DAG.getConstant(13, DL, MVT::i32));

  SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
                                   DAG.getConstant(0x1000, DL, MVT::i32));

  SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
  SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
  SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
  D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);

  SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
  SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
                              DAG.getConstant(0x7, DL, MVT::i32));
  V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
                  DAG.getConstant(2, DL, MVT::i32));
  SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
                               One, Zero, ISD::SETEQ);
  SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
                               One, Zero, ISD::SETGT);
  V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
  V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);

  V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
                      DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
  V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
                      I, V, ISD::SETEQ);

  // Extract the sign bit.
  SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
                             DAG.getConstant(16, DL, MVT::i32));
  Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
                     DAG.getConstant(0x8000, DL, MVT::i32));

  V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
  return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}

SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();

  // Will be selected natively
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
    return Op;

  if (SrcVT == MVT::bf16) {
    SDLoc DL(Op);
    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
  }

  // Promote i16 to i32
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
  }

  if (DestVT != MVT::i64)
    return Op;

  if (SrcVT == MVT::f16 ||
      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    unsigned Ext =
        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
  }

  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
    return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);

  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  assert(VT.isVector());

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getBuildVector(VT, DL, Args);
}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

static bool isU24(SDValue Op, SelectionDAG &DAG) {
  return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
}

static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
}

static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    unsigned IID = Node24->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications
  // that involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
                       DemandedLHS ? DemandedLHS : LHS,
                       DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}

template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
                               uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    return DAG.getConstant(Result, DL, MVT::i32);
  }

  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
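
// Worked example (illustrative, added commentary): extracting 8 bits at
// offset 8 from 0x12345678 gives 0x56. Width + Offset = 16 < 32, so Shl is
// 0x12345678 << 16 = 0x56780000, and the shift right by 24 (logical for the
// unsigned variant, arithmetic for the signed one) yields 0x56.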

static bool hasVolatileUser(SDNode *Val) {
  for (SDNode *U : Val->uses()) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
      if (M->isVolatile())
        return true;
    }
  }

  return false;
}

bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
  // i32 vectors are the canonical memory type.
  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    return false;

  if (!VT.isByteSized())
    return false;

  unsigned Size = VT.getStoreSize();

  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    return false;

  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    return false;

  return true;
}

// Replace load of an illegal type with a load of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  LoadSDNode *LN = cast<LoadSDNode>(N);
  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack
    // and unpack the bytes again are not eliminated in the case of an
    // unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);

      return DAG.getMergeValues(Ops, SDLoc(N));
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(NewVT, SL, LN->getChain(),
                  LN->getBasePtr(), LN->getMemOperand());

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
  DCI.CombineTo(N, BC, NewLoad.getValue(1));
  return SDValue(N, 0);
}

// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  StoreSDNode *SN = cast<StoreSDNode>(N);
  if (!SN->isSimple() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack
    // and unpack the bytes again are not eliminated in the case of an
    // unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(SDValue(SN, 0), DAG);

      return expandUnalignedStore(SN, DAG);
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}
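
// Illustrative note (added commentary): e.g. a well-aligned v4i8 load or
// store passes shouldCombineMemoryType and is rewritten above as an i32
// access plus a bitcast, the canonical 32-bit memory type chosen by
// getEquivalentMemType, rather than being scalarized by type legalization.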

// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
                                                        DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
  //   (vt2 (truncate (assertzext vt0:x, vt1)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    SDValue N1 = N->getOperand(1);
    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.bitsGE(ExtVT)) {
      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    }
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(1);
    SDValue PeekSign = peekFPSignOps(Src);
    if (PeekSign == Src)
      return SDValue();
    return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}

/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
    DAGCombinerInfo &DCI, const SDLoc &SL,
    unsigned Opc, SDValue LHS,
    uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  unsigned RHSVal = RHS->getZExtValue();
  if (!RHSVal)
    return LHS;

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  switch (LHS->getOpcode()) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    SDValue X = LHS->getOperand(0);

    if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
        isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
      // Prefer build_vector as the canonical form if packed types are legal.
      // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
      SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
          { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
      return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
    }

    // shl (ext x) => zext (shl x), if shift does not overflow int
    if (VT != MVT::i64)
      break;
    KnownBits Known = DAG.computeKnownBits(X);
    unsigned LZ = Known.countMinLeadingZeros();
    if (LZ < RHSVal)
      break;
    EVT XVT = X.getValueType();
    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
    return DAG.getZExtOrTrunc(Shl, SL, VT);
  }
  }

  if (VT != MVT::i64)
    return SDValue();

  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
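  //
  // Worked example (illustrative, added commentary): (shl i64:x, 40) becomes
  // build_pair 0, (shl (i32 (trunc x)), 8); the low 32 result bits are known
  // zero, and only a single 32-bit shift of the truncated value remains.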
  if (RHSVal < 32)
    return SDValue();

  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}

SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i64)
    return SDValue();

  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal = RHS->getZExtValue();

  // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
  if (RHSVal == 32) {
    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
                                   DAG.getConstant(31, SL, MVT::i32));

    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
  }

  // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
  if (RHSVal == 63) {
    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
                                   DAG.getConstant(31, SL, MVT::i32));
    SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHS)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  unsigned ShiftAmt = RHS->getZExtValue();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
  // This improves the ability to match BFE patterns in isel.
  if (LHS.getOpcode() == ISD::AND) {
    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
      unsigned MaskIdx, MaskLen;
      if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
          MaskIdx == ShiftAmt) {
        return DAG.getNode(
            ISD::AND, SL, VT,
            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
      }
    }
  }

  if (VT != MVT::i64)
    return SDValue();

  if (ShiftAmt < 32)
    return SDValue();

  // srl i64:x, C for C >= 32
  // =>
  //   build_pair (srl hi_32(x), C - 32), 0
  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

  SDValue Hi = getHiHalf64(LHS, DAG);

  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);

  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});

  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
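
// Worked example (illustrative, added commentary): (srl i64:x, 36) becomes
// build_pair (srl hi_32(x), 4), 0; the upper 32 result bits are zero and
// only one 32-bit shift of the original high half is needed.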

SDValue AMDGPUTargetLowering::performTruncateCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(0);
      EVT EltVT = Elt0.getValueType();
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(ISD::BITCAST, SL,
                             EltVT.changeTypeToInteger(), Elt0);
        }

        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
        SDValue BV = stripBitcast(Src.getOperand(0));
        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
            BV.getValueType().getVectorNumElements() == 2) {
          SDValue SrcElt = BV.getOperand(1);
          EVT SrcEltVT = SrcElt.getValueType();
          if (SrcEltVT.isFloatingPoint()) {
            SrcElt = DAG.getNode(ISD::BITCAST, SL,
                                 SrcEltVT.changeTypeToInteger(), SrcElt);
          }

          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  //   i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(1);
      KnownBits Known = DAG.computeKnownBits(Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
          (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
            EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                             VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
                                    Src.getOperand(0));
        DCI.AddToWorklist(Trunc.getNode());

        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
          DCI.AddToWorklist(Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
                                          Trunc, Amt);
        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
      }
    }
  }

  return SDValue();
}

// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be
// multiple uses due to the separate mul + mulh[su].
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
  if (Size <= 32) {
    unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
    return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
  }

  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;

  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);

  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
}

/// If \p V is an add of a constant 1, returns the other operand. Otherwise
/// return SDValue().
static SDValue getAddOneOp(const SDNode *V) {
  if (V->getOpcode() != ISD::ADD)
    return SDValue();

  return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
}

SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V.getNode());
    if (!AddOp)
      return SDValue();

    if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
  }

  // There are i16 integer mul/mad.
  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated.
  // Since we can assume the high bits are whatever we want, use the underlying
  // value to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  SDValue Mul;

  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, false);
  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Mul, DL, VT);
}

SDValue
AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated.
  // Since we can assume the high bits are whatever we want, use the underlying
  // value to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);
  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  unsigned LoOpcode = 0;
  unsigned HiOpcode = 0;
  if (Signed) {
    if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_I24;
      HiOpcode = AMDGPUISD::MULHI_I24;
    }
  } else {
    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_U24;
      HiOpcode = AMDGPUISD::MULHI_U24;
    }
  }
  if (!LoOpcode)
    return SDValue();

  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  DCI.CombineTo(N, Lo, Hi);
  return SDValue(N, 0);
}

SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulI24() || VT.isVector())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isI24(N0, DAG) || !isI24(N1, DAG))
    return SDValue();

  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
}

SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (!isU24(N0, DAG) || !isU24(N1, DAG))
    return SDValue();

  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);

  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}

SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
                                          SDValue Op,
                                          const SDLoc &DL,
                                          unsigned Opc) const {
  EVT VT = Op.getValueType();
  EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
  if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
                              LegalVT != MVT::i16))
    return SDValue();

  if (VT != MVT::i32)
    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);

  SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
  if (VT != MVT::i32)
    FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);

  return FFBX;
}

// The native instructions return -1 on 0 input. Optimize out a select that
// produces -1 on 0.
//
// TODO: If zero is not undef, we could also do this if the output is compared
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL,
                                                      SDValue Cond,
                                                      SDValue LHS, SDValue RHS,
                                                      DAGCombinerInfo &DCI) const {
  if (!isNullConstant(Cond.getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue CmpLHS = Cond.getOperand(0);

  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
  if (CCOpcode == ISD::SETEQ &&
      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
      RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
    unsigned Opc =
        isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
  if (CCOpcode == ISD::SETNE &&
      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
      LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
    unsigned Opc =
        isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;

    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  return SDValue();
}

static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
                                         unsigned Op,
                                         const SDLoc &SL,
                                         SDValue Cond,
                                         SDValue N1,
                                         SDValue N2) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N1.getValueType();

  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
                                  N1.getOperand(0), N2.getOperand(0));
  DCI.AddToWorklist(NewSelect.getNode());
  return DAG.getNode(Op, SL, VT, NewSelect);
}

// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(0);
  SDValue LHS = N.getOperand(1);
  SDValue RHS = N.getOperand(2);

  EVT VT = N.getValueType();
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
                                     SDLoc(N), Cond, LHS, RHS);
  }

  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(LHS, RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
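    //
    // Illustrative example (added commentary): select c, (fneg x), 2.0
    // becomes fneg (select c, x, -2.0), letting the fneg later fold into a
    // user as a source modifier.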
    SDValue NewLHS = LHS.getOperand(0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
        return SDValue();

      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

      if (Inv)
        std::swap(NewLHS, NewRHS);

      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
                                      Cond, NewLHS, NewRHS);
      DCI.AddToWorklist(NewSelect.getNode());
      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    }
  }

  return SDValue();
}

SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  SDValue CC = Cond.getOperand(2);

  SDValue True = N->getOperand(1);
  SDValue False = N->getOperand(2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(True) &&
        !DAG.isConstantValueOfAnyType(False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}

static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) ||
         APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}
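
// Illustrative note (added commentary): these bit patterns are 1/(2*pi)
// (~0.15915494) in f16, f32 and f64. Subtargets with hasInv2PiInlineImm()
// can encode the positive value as an inline immediate, which is why its
// negation is treated as the expensive direction below.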

// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
TargetLowering::NegatibleCost
AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
  if (C->isZero())
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  return NegatibleCost::Neutral;
}

bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    return getConstantNegateCost(C) == NegatibleCost::Expensive;
  return false;
}

bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
  return false;
}

static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case ISD::FMAXNUM:
    return ISD::FMINNUM;
  case ISD::FMINNUM:
    return ISD::FMAXNUM;
  case ISD::FMAXNUM_IEEE:
    return ISD::FMINNUM_IEEE;
  case ISD::FMINNUM_IEEE:
    return ISD::FMAXNUM_IEEE;
  case ISD::FMAXIMUM:
    return ISD::FMINIMUM;
  case ISD::FMINIMUM:
    return ISD::FMAXIMUM;
  case AMDGPUISD::FMAX_LEGACY:
    return AMDGPUISD::FMIN_LEGACY;
  case AMDGPUISD::FMIN_LEGACY:
    return AMDGPUISD::FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

/// \return true if it's profitable to try to push an fneg into its source
/// instruction.
bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (N0.hasOneUse()) {
    // This may be able to fold into the source, but at a code size cost. Don't
    // fold if the fold into the user is free.
    if (allUsesHaveSourceMods(N, 0))
      return false;
  } else {
    if (fnegFoldsIntoOp(N0.getNode()) &&
        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
      return false;
  }

  return true;
}

SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    else
      LHS = LHS.getOperand(0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(RHS))
      return SDValue();

SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    else
      LHS = LHS.getOperand(0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0))
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
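
  // e.g. (fneg (fminnum x, 2.0)) becomes (fmaxnum (fneg x), -2.0); both 2.0
  // and -2.0 are inline immediates, so the isConstantCostlierToNegate check
  // above lets the fold through.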

  case AMDGPUISD::FMED3: {
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());

    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
      DAG.ReplaceAllUsesWith(N0, Neg);

      for (SDNode *U : Neg->uses())
        DCI.AddToWorklist(U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
  }
  case ISD::FP_ROUND: {
    SDValue CvtSrc = N0.getOperand(0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(ISD::FP_ROUND, SL, VT,
                         CvtSrc.getOperand(0), N0.getOperand(1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
                                  DAG.getConstant(0x8000, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
      SDValue CastBack =
          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);

      SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
      Ops.back() = CastBack;
      DCI.AddToWorklist(NegHi.getNode());
      SDValue Build =
          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Casting back the result for multiple uses is beneficial in some
      // cases.

      SDValue LHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
      SDValue RHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));

      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);

      return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
                         NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}

SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  if (!N0.hasOneUse())
    return SDValue();

  switch (N0.getOpcode()) {
  case ISD::FP16_TO_FP: {
    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
    SDLoc SL(N);
    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
                                  DAG.getConstant(0x7fff, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
  }
  default:
    return SDValue();
  }
}

SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CFP)
    return SDValue();

  // XXX - Should this flush denormals?
  const APFloat &Val = CFP->getValueAPF();
  APFloat One(Val.getSemantics(), "1.0");
  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
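
// Note: performRcpCombine above folds rcp of a constant by evaluating the
// division in the constant's own floating-point semantics, e.g. (rcp 2.0)
// becomes the constant 0.5.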

SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(I);
            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
          }

          return DAG.getBuildVector(DestVT, SL, CastedElts);
        }
      }
    }

    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
    }

    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    }

    break;
  }
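
  // e.g. (v2i32 (bitcast (i64 0x123456789abcdef0))) becomes
  // (build_vector 0x9abcdef0, 0x12345678), with the low half in element 0 and
  // the high half in element 1.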

  case ISD::SHL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performShlCombine(N, DCI);
  }
  case ISD::SRL: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSrlCombine(N, DCI);
  }
  case ISD::SRA: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    return performSraCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, DL, MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of
        // existing DAG Combines. If not eliminated, we will match back to BFE
        // during selection.

        // TODO: The sext_inreg of extended types ends, although we could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal,
                                       DL);
    }

    if ((OffsetVal + WidthVal) >= 32 &&
        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
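
  // e.g. (bfe_u32 x, 8, 8) extracts bits [15:8] of x; when the offset is 0,
  // the BFE above is rewritten as a plain zero- or sign-extension-in-register
  // so the generic combines can try to eliminate it.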

  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    EVT VT = N->getValueType(0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
    if (N0CFP && N1CFP && N2CFP) {
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(V1, APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(V2, APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}

//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                   const TargetRegisterClass *RC,
                                                   Register Reg, EVT VT,
                                                   const SDLoc &SL,
                                                   bool RawReg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register VReg;

  if (!MRI.isLiveIn(Reg)) {
    VReg = MRI.createVirtualRegister(RC);
    MRI.addLiveIn(Reg, VReg);
  } else {
    VReg = MRI.getLiveInVirtReg(Reg);
  }

  if (RawReg)
    return DAG.getRegister(VReg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}

// This may be called multiple times, and nothing prevents creating multiple
// objects at the same offset. See if we already defined this object.
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
                                       int64_t Offset) {
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
    if (MFI.getObjectOffset(I) == Offset) {
      assert(MFI.getObjectSize(I) == Size);
      return I;
    }
  }

  return MFI.CreateFixedObject(Size, Offset, true);
}

SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
                                                  EVT VT,
                                                  const SDLoc &SL,
                                                  int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);

  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);

  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
                     MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}

SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   SDValue ArgVal,
                                                   int64_t Offset) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
  // Stores to the argument stack area are relative to the stack pointer.
  SDValue SP =
      DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
  Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
                               MachineMemOperand::MODereferenceable);
  return Store;
}
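
// Some inputs, e.g. the packed workitem IDs, share one register; a masked
// ArgDescriptor records which bits hold the value, and loadInputValue below
// extracts the field with an srl + and pair, so a mask of 0x3ff00000 yields
// ((x >> 20) & 0x3ff).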

SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
                                             const TargetRegisterClass *RC,
                                             EVT VT, const SDLoc &SL,
                                             const ArgDescriptor &Arg) const {
  assert(Arg && "Attempting to load missing argument");

  SDValue V = Arg.isRegister() ?
    CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
    loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());

  if (!Arg.isMasked())
    return V;

  unsigned Mask = Arg.getMask();
  unsigned Shift = llvm::countr_zero<unsigned>(Mask);
  V = DAG.getNode(ISD::SRL, SL, VT, V,
                  DAG.getShiftAmountConstant(Shift, VT, SL));
  return DAG.getNode(ISD::AND, SL, VT, V,
                     DAG.getConstant(Mask >> Shift, SL, VT));
}

uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
  unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
  uint64_t ArgOffset =
      alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
  switch (Param) {
  case FIRST_IMPLICIT:
    return ArgOffset;
  case PRIVATE_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
  case SHARED_BASE:
    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
  case QUEUE_PTR:
    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
  }
  llvm_unreachable("unexpected implicit parameter type");
}

uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
    const MachineFunction &MF, const ImplicitParameter Param) const {
  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
}
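
// Implicit parameters live directly after the explicit kernel arguments,
// aligned up; e.g. with 20 bytes of explicit arguments, 8-byte implicit-arg
// alignment, and a zero explicit-arg base offset, the first implicit
// parameter lands at byte offset 24 of the kernarg segment.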

#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((AMDGPUISD::NodeType)Opcode) {
  case AMDGPUISD::FIRST_NUMBER: break;
  // AMDIL DAG nodes
  NODE_NAME_CASE(UMUL);
  NODE_NAME_CASE(BRANCH_COND);

  // AMDGPU DAG nodes
  NODE_NAME_CASE(IF)
  NODE_NAME_CASE(ELSE)
  NODE_NAME_CASE(LOOP)
  NODE_NAME_CASE(CALL)
  NODE_NAME_CASE(TC_RETURN)
  NODE_NAME_CASE(TC_RETURN_GFX)
  NODE_NAME_CASE(TC_RETURN_CHAIN)
  NODE_NAME_CASE(TRAP)
  NODE_NAME_CASE(RET_GLUE)
  NODE_NAME_CASE(WAVE_ADDRESS)
  NODE_NAME_CASE(RETURN_TO_EPILOG)
  NODE_NAME_CASE(ENDPGM)
  NODE_NAME_CASE(ENDPGM_TRAP)
  NODE_NAME_CASE(SIMULATED_TRAP)
  NODE_NAME_CASE(DWORDADDR)
  NODE_NAME_CASE(FRACT)
  NODE_NAME_CASE(SETCC)
  NODE_NAME_CASE(SETREG)
  NODE_NAME_CASE(DENORM_MODE)
  NODE_NAME_CASE(FMA_W_CHAIN)
  NODE_NAME_CASE(FMUL_W_CHAIN)
  NODE_NAME_CASE(CLAMP)
  NODE_NAME_CASE(COS_HW)
  NODE_NAME_CASE(SIN_HW)
  NODE_NAME_CASE(FMAX_LEGACY)
  NODE_NAME_CASE(FMIN_LEGACY)
  NODE_NAME_CASE(FMAX3)
  NODE_NAME_CASE(SMAX3)
  NODE_NAME_CASE(UMAX3)
  NODE_NAME_CASE(FMIN3)
  NODE_NAME_CASE(SMIN3)
  NODE_NAME_CASE(UMIN3)
  NODE_NAME_CASE(FMED3)
  NODE_NAME_CASE(SMED3)
  NODE_NAME_CASE(UMED3)
  NODE_NAME_CASE(FMAXIMUM3)
  NODE_NAME_CASE(FMINIMUM3)
  NODE_NAME_CASE(FDOT2)
  NODE_NAME_CASE(URECIP)
  NODE_NAME_CASE(DIV_SCALE)
  NODE_NAME_CASE(DIV_FMAS)
  NODE_NAME_CASE(DIV_FIXUP)
  NODE_NAME_CASE(FMAD_FTZ)
  NODE_NAME_CASE(RCP)
  NODE_NAME_CASE(RSQ)
  NODE_NAME_CASE(RCP_LEGACY)
  NODE_NAME_CASE(RCP_IFLAG)
  NODE_NAME_CASE(LOG)
  NODE_NAME_CASE(EXP)
  NODE_NAME_CASE(FMUL_LEGACY)
  NODE_NAME_CASE(RSQ_CLAMP)
  NODE_NAME_CASE(FP_CLASS)
  NODE_NAME_CASE(DOT4)
  NODE_NAME_CASE(CARRY)
  NODE_NAME_CASE(BORROW)
  NODE_NAME_CASE(BFE_U32)
  NODE_NAME_CASE(BFE_I32)
  NODE_NAME_CASE(BFI)
  NODE_NAME_CASE(BFM)
  NODE_NAME_CASE(FFBH_U32)
  NODE_NAME_CASE(FFBH_I32)
  NODE_NAME_CASE(FFBL_B32)
  NODE_NAME_CASE(MUL_U24)
  NODE_NAME_CASE(MUL_I24)
  NODE_NAME_CASE(MULHI_U24)
  NODE_NAME_CASE(MULHI_I24)
  NODE_NAME_CASE(MAD_U24)
  NODE_NAME_CASE(MAD_I24)
  NODE_NAME_CASE(MAD_I64_I32)
  NODE_NAME_CASE(MAD_U64_U32)
  NODE_NAME_CASE(PERM)
  NODE_NAME_CASE(TEXTURE_FETCH)
  NODE_NAME_CASE(R600_EXPORT)
  NODE_NAME_CASE(CONST_ADDRESS)
  NODE_NAME_CASE(REGISTER_LOAD)
  NODE_NAME_CASE(REGISTER_STORE)
  NODE_NAME_CASE(SAMPLE)
  NODE_NAME_CASE(SAMPLEB)
  NODE_NAME_CASE(SAMPLED)
  NODE_NAME_CASE(SAMPLEL)
  NODE_NAME_CASE(CVT_F32_UBYTE0)
  NODE_NAME_CASE(CVT_F32_UBYTE1)
  NODE_NAME_CASE(CVT_F32_UBYTE2)
  NODE_NAME_CASE(CVT_F32_UBYTE3)
  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
  NODE_NAME_CASE(CVT_PKNORM_I16_F32)
  NODE_NAME_CASE(CVT_PKNORM_U16_F32)
  NODE_NAME_CASE(CVT_PK_I16_I32)
  NODE_NAME_CASE(CVT_PK_U16_U32)
  NODE_NAME_CASE(FP_TO_FP16)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
  NODE_NAME_CASE(LDS)
  NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
  NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
  NODE_NAME_CASE(DUMMY_CHAIN)
  case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
  NODE_NAME_CASE(LOAD_D16_HI)
  NODE_NAME_CASE(LOAD_D16_LO)
  NODE_NAME_CASE(LOAD_D16_HI_I8)
  NODE_NAME_CASE(LOAD_D16_HI_U8)
  NODE_NAME_CASE(LOAD_D16_LO_I8)
  NODE_NAME_CASE(LOAD_D16_LO_U8)
  NODE_NAME_CASE(STORE_MSKOR)
  NODE_NAME_CASE(LOAD_CONSTANT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
  NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(DS_ORDERED_COUNT)
  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
  NODE_NAME_CASE(BUFFER_LOAD)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT)
  NODE_NAME_CASE(BUFFER_LOAD_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
  NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
  NODE_NAME_CASE(SBUFFER_LOAD)
  NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
  NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
  NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
  NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
  NODE_NAME_CASE(BUFFER_STORE)
  NODE_NAME_CASE(BUFFER_STORE_BYTE)
  NODE_NAME_CASE(BUFFER_STORE_SHORT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT)
  NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
  NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_AND)
  NODE_NAME_CASE(BUFFER_ATOMIC_OR)
  NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
  NODE_NAME_CASE(BUFFER_ATOMIC_INC)
  NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
  NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
  NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
  NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
  NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
  NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
  NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)

  case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
  }
  return nullptr;
}

SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
                                              SelectionDAG &DAG, int Enabled,
                                              int &RefinementSteps,
                                              bool &UseOneConstNR,
                                              bool Reciprocal) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also an f64 rsq instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}
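
// Note: getSqrtEstimate above and getRecipEstimate below both report zero
// refinement steps, so the hardware approximations are used as-is rather than
// being polished with Newton-Raphson iterations.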

SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
                                               SelectionDAG &DAG, int Enabled,
                                               int &RefinementSteps) const {
  EVT VT = Operand.getValueType();

  if (VT == MVT::f32) {
    // Reciprocal, < 1 ulp error.
    //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson step performed with two fused multiply-adds (FMAs).

    RefinementSteps = 0;
    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
  }

  // TODO: There is also an f64 rcp instruction, but the documentation is less
  // clear on its precision.

  return SDValue();
}

static unsigned workitemIntrinsicDim(unsigned ID) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    return 0;
  case Intrinsic::amdgcn_workitem_id_y:
    return 1;
  case Intrinsic::amdgcn_workitem_id_z:
    return 2;
  default:
    llvm_unreachable("not a workitem intrinsic");
  }
}

void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    Known.Zero = APInt::getHighBitsSet(32, 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CWidth)
      return;

    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(TrailZ, 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(24);
    RHSKnown = RHSKnown.trunc(24);

    if (Opc == AMDGPUISD::MUL_I24) {
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
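
  // e.g. for mul_u24 of two operands each known to fit in 12 bits, the
  // product fits in 24 bits, so the top 8 bits are reported as known zero.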

  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());

    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
      Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
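
// e.g. a BFE_I32 with a constant width of 8 guarantees 32 - 8 + 1 = 25 sign
// bits, matching the 25 reported for the sign-extending byte buffer loads
// below.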

unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    if (!Width)
      return 1;

    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(Op.getOperand(1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    return std::max(SignBits, Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}

unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelKnownBits &Analysis, Register R,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min({Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
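
// Note: several cases below return true only when SNaN is set. The query is
// then just whether the node can produce a signaling NaN; these operations
// presumably quiet any NaN inputs, so a signaling result is ruled out even
// when a quiet NaN is still possible.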

bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                        const SelectionDAG &DAG,
                                                        bool SNaN,
                                                        unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    if (SNaN)
      return true;

    // TODO: Can check no-NaNs on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
           DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Needs an is-known-positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp: {
      if (SNaN)
        return true;

      // TODO: Needs an is-known-positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::Xchg: {
    const DataLayout &DL = RMW->getFunction()->getDataLayout();
    unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
    if (ValSize == 32 || ValSize == 64)
      return AtomicExpansionKind::None;
    return AtomicExpansionKind::CmpXChg;
  }
  default: {
    if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
      unsigned Size = IntTy->getBitWidth();
      if (Size == 32 || Size == 64)
        return AtomicExpansionKind::None;
    }

    return AtomicExpansionKind::CmpXChg;
  }
  }
}

/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
bool AMDGPUTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace PatternMatch;

  for (auto &Op : I->operands()) {
    // Ensure we are not already sinking this operand.
    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
      continue;

    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
      Ops.push_back(&Op);
  }

  return !Ops.empty();
}