Path: blob/main/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAliasAnalysis.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXCtorDtorLowering.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <cassert>
#include <optional>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag once we are confident there are no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement to preserve a "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

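// Usage sketch (illustrative assumption: these cl::opt switches are normally
// passed on the llc command line; other drivers may spell this differently):
//
//   llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 -nvptx-short-ptr \
//       -disable-nvptx-load-store-vectorizer input.ll -o output.ptx
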
namespace llvm {

void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXLowerUnreachablePass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);

} // end namespace llvm

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  PassRegistry &PR = *PassRegistry::getPassRegistry();
  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMLegacyPassPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerUnreachablePass(PR);
  initializeNVPTXCtorDtorLoweringLegacyPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
  initializeNVPTXDAGToDAGISelLegacyPass(PR);
  initializeNVPTXAAWrapperPassPass(PR);
  initializeNVPTXExternalAAWrapperPass(PR);
}

static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

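// For reference, the layout strings produced above (derived directly from the
// concatenation in computeDataLayout, listed here only as a convenience):
//
//   64-bit, default pointers:
//     "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   32-bit:
//     "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
//   64-bit with -nvptx-short-ptr:
//     "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
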
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       std::optional<Reloc::Model> RM,
                                       std::optional<CodeModel::Model> CM,
                                       CodeGenOptLevel OL, bool is64bit)
    // The PIC relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this),
      StrPool(StrAlloc) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOptLevel OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           std::optional<Reloc::Model> RM,
                                           std::optional<CodeModel::Model> CM,
                                           CodeGenOptLevel OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

MachineFunctionInfo *NVPTXTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return NVPTXMachineFunctionInfo::create<NVPTXMachineFunctionInfo>(Allocator,
                                                                    F, STI);
}

void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<NVPTXAA>();
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "NVPTXPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // Note: NVVMIntrRangePass was causing numerical discrepancies at one
        // point; if issues crop up, consider disabling it.
        FPM.addPass(NVVMIntrRangePass());
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
    case Intrinsic::nvvm_isspacep_shared_cluster:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

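// Illustrative IR sketch (an assumed shape, not taken from this file):
//
//   %is.global = call i1 @llvm.nvvm.isspacep.global(ptr %p)
//   br i1 %is.global, label %then, label %else
//
// On the %then path, the (pointer, address space) pair returned above lets
// InferAddressSpaces treat %p as a global-space pointer.
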
void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters, which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

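// Illustrative sketch of what the sequence above buys us (assumed IR shapes,
// not taken from this file): NVPTXLowerArgs wraps a kernel pointer argument
// in a round-trip cast, roughly
//
//   %arg.global = addrspacecast ptr %arg to ptr addrspace(1)
//   %arg.gen    = addrspacecast ptr addrspace(1) %arg.global to ptr
//
// and InferAddressSpaces then propagates the known address space to the
// actual loads and stores, so the backend can emit ld.global/st.global
// instead of generic ld/st.
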
void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineLateInstrsCleanupID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  addPass(createNVPTXAAWrapperPass());
  addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
    if (auto *WrapperPass = P.getAnalysisIfAvailable<NVPTXAAWrapperPass>())
      AAR.addAAResult(WrapperPass->getResult());
  }));

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOptLevel::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMLegacyPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass());
  if (getOptLevel() != CodeGenOptLevel::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandLegacyPass());
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
  addPass(createNVPTXCtorDtorLoweringLegacyPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOptLevel::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }

  const auto &Options = getNVPTXTargetMachine().Options;
  addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable,
                                          Options.NoTrapAfterNoreturn));
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOptLevel::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However,
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}