Path: blob/main/contrib/llvm-project/clang/lib/Driver/ToolChains/Cuda.cpp
35269 views
//===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//78#include "Cuda.h"9#include "CommonArgs.h"10#include "clang/Basic/Cuda.h"11#include "clang/Config/config.h"12#include "clang/Driver/Compilation.h"13#include "clang/Driver/Distro.h"14#include "clang/Driver/Driver.h"15#include "clang/Driver/DriverDiagnostic.h"16#include "clang/Driver/InputInfo.h"17#include "clang/Driver/Options.h"18#include "llvm/ADT/StringExtras.h"19#include "llvm/Option/ArgList.h"20#include "llvm/Support/FileSystem.h"21#include "llvm/Support/FormatAdapters.h"22#include "llvm/Support/FormatVariadic.h"23#include "llvm/Support/Path.h"24#include "llvm/Support/Process.h"25#include "llvm/Support/Program.h"26#include "llvm/Support/VirtualFileSystem.h"27#include "llvm/TargetParser/Host.h"28#include "llvm/TargetParser/TargetParser.h"29#include <system_error>3031using namespace clang::driver;32using namespace clang::driver::toolchains;33using namespace clang::driver::tools;34using namespace clang;35using namespace llvm::opt;3637namespace {3839CudaVersion getCudaVersion(uint32_t raw_version) {40if (raw_version < 7050)41return CudaVersion::CUDA_70;42if (raw_version < 8000)43return CudaVersion::CUDA_75;44if (raw_version < 9000)45return CudaVersion::CUDA_80;46if (raw_version < 9010)47return CudaVersion::CUDA_90;48if (raw_version < 9020)49return CudaVersion::CUDA_91;50if (raw_version < 10000)51return CudaVersion::CUDA_92;52if (raw_version < 10010)53return CudaVersion::CUDA_100;54if (raw_version < 10020)55return CudaVersion::CUDA_101;56if (raw_version < 11000)57return CudaVersion::CUDA_102;58if (raw_version < 11010)59return CudaVersion::CUDA_110;60if (raw_version < 11020)61return CudaVersion::CUDA_111;62if (raw_version < 11030)63return CudaVersion::CUDA_112;64if (raw_version < 11040)65return CudaVersion::CUDA_113;66if (raw_version < 11050)67return CudaVersion::CUDA_114;68if (raw_version < 11060)69return CudaVersion::CUDA_115;70if (raw_version < 11070)71return CudaVersion::CUDA_116;72if (raw_version < 11080)73return CudaVersion::CUDA_117;74if (raw_version < 11090)75return CudaVersion::CUDA_118;76if (raw_version < 12010)77return CudaVersion::CUDA_120;78if (raw_version < 12020)79return CudaVersion::CUDA_121;80if (raw_version < 12030)81return CudaVersion::CUDA_122;82if (raw_version < 12040)83return CudaVersion::CUDA_123;84if (raw_version < 12050)85return CudaVersion::CUDA_124;86if (raw_version < 12060)87return CudaVersion::CUDA_125;88return CudaVersion::NEW;89}9091CudaVersion parseCudaHFile(llvm::StringRef Input) {92// Helper lambda which skips the words if the line starts with them or returns93// std::nullopt otherwise.94auto StartsWithWords =95[](llvm::StringRef Line,96const SmallVector<StringRef, 3> words) -> std::optional<StringRef> {97for (StringRef word : words) {98if (!Line.consume_front(word))99return {};100Line = Line.ltrim();101}102return Line;103};104105Input = Input.ltrim();106while (!Input.empty()) {107if (auto Line =108StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) {109uint32_t RawVersion;110Line->consumeInteger(10, RawVersion);111return getCudaVersion(RawVersion);112}113// Find next non-empty line.114Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim();115}116return CudaVersion::UNKNOWN;117}118} // namespace119120void CudaInstallationDetector::WarnIfUnsupportedVersion() {121if (Version > CudaVersion::PARTIALLY_SUPPORTED) {122std::string VersionString = CudaVersionToString(Version);123if (!VersionString.empty())124VersionString.insert(0, " ");125D.Diag(diag::warn_drv_new_cuda_version)126<< VersionString127<< (CudaVersion::PARTIALLY_SUPPORTED != CudaVersion::FULLY_SUPPORTED)128<< CudaVersionToString(CudaVersion::PARTIALLY_SUPPORTED);129} else if (Version > CudaVersion::FULLY_SUPPORTED)130D.Diag(diag::warn_drv_partially_supported_cuda_version)131<< CudaVersionToString(Version);132}133134CudaInstallationDetector::CudaInstallationDetector(135const Driver &D, const llvm::Triple &HostTriple,136const llvm::opt::ArgList &Args)137: D(D) {138struct Candidate {139std::string Path;140bool StrictChecking;141142Candidate(std::string Path, bool StrictChecking = false)143: Path(Path), StrictChecking(StrictChecking) {}144};145SmallVector<Candidate, 4> Candidates;146147// In decreasing order so we prefer newer versions to older versions.148std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};149auto &FS = D.getVFS();150151if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {152Candidates.emplace_back(153Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());154} else if (HostTriple.isOSWindows()) {155for (const char *Ver : Versions)156Candidates.emplace_back(157D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +158Ver);159} else {160if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {161// Try to find ptxas binary. If the executable is located in a directory162// called 'bin/', its parent directory might be a good guess for a valid163// CUDA installation.164// However, some distributions might installs 'ptxas' to /usr/bin. In that165// case the candidate would be '/usr' which passes the following checks166// because '/usr/include' exists as well. To avoid this case, we always167// check for the directory potentially containing files for libdevice,168// even if the user passes -nocudalib.169if (llvm::ErrorOr<std::string> ptxas =170llvm::sys::findProgramByName("ptxas")) {171SmallString<256> ptxasAbsolutePath;172llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath);173174StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath);175if (llvm::sys::path::filename(ptxasDir) == "bin")176Candidates.emplace_back(177std::string(llvm::sys::path::parent_path(ptxasDir)),178/*StrictChecking=*/true);179}180}181182Candidates.emplace_back(D.SysRoot + "/usr/local/cuda");183for (const char *Ver : Versions)184Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver);185186Distro Dist(FS, llvm::Triple(llvm::sys::getProcessTriple()));187if (Dist.IsDebian() || Dist.IsUbuntu())188// Special case for Debian to have nvidia-cuda-toolkit work189// out of the box. More info on http://bugs.debian.org/882505190Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda");191}192193bool NoCudaLib = Args.hasArg(options::OPT_nogpulib);194195for (const auto &Candidate : Candidates) {196InstallPath = Candidate.Path;197if (InstallPath.empty() || !FS.exists(InstallPath))198continue;199200BinPath = InstallPath + "/bin";201IncludePath = InstallPath + "/include";202LibDevicePath = InstallPath + "/nvvm/libdevice";203204if (!(FS.exists(IncludePath) && FS.exists(BinPath)))205continue;206bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking);207if (CheckLibDevice && !FS.exists(LibDevicePath))208continue;209210Version = CudaVersion::UNKNOWN;211if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h"))212Version = parseCudaHFile((*CudaHFile)->getBuffer());213// As the last resort, make an educated guess between CUDA-7.0, which had214// old-style libdevice bitcode, and an unknown recent CUDA version.215if (Version == CudaVersion::UNKNOWN) {216Version = FS.exists(LibDevicePath + "/libdevice.10.bc")217? CudaVersion::NEW218: CudaVersion::CUDA_70;219}220221if (Version >= CudaVersion::CUDA_90) {222// CUDA-9+ uses single libdevice file for all GPU variants.223std::string FilePath = LibDevicePath + "/libdevice.10.bc";224if (FS.exists(FilePath)) {225for (int Arch = (int)OffloadArch::SM_30, E = (int)OffloadArch::LAST;226Arch < E; ++Arch) {227OffloadArch OA = static_cast<OffloadArch>(Arch);228if (!IsNVIDIAOffloadArch(OA))229continue;230std::string OffloadArchName(OffloadArchToString(OA));231LibDeviceMap[OffloadArchName] = FilePath;232}233}234} else {235std::error_code EC;236for (llvm::vfs::directory_iterator LI = FS.dir_begin(LibDevicePath, EC),237LE;238!EC && LI != LE; LI = LI.increment(EC)) {239StringRef FilePath = LI->path();240StringRef FileName = llvm::sys::path::filename(FilePath);241// Process all bitcode filenames that look like242// libdevice.compute_XX.YY.bc243const StringRef LibDeviceName = "libdevice.";244if (!(FileName.starts_with(LibDeviceName) && FileName.ends_with(".bc")))245continue;246StringRef GpuArch = FileName.slice(247LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));248LibDeviceMap[GpuArch] = FilePath.str();249// Insert map entries for specific devices with this compute250// capability. NVCC's choice of the libdevice library version is251// rather peculiar and depends on the CUDA version.252if (GpuArch == "compute_20") {253LibDeviceMap["sm_20"] = std::string(FilePath);254LibDeviceMap["sm_21"] = std::string(FilePath);255LibDeviceMap["sm_32"] = std::string(FilePath);256} else if (GpuArch == "compute_30") {257LibDeviceMap["sm_30"] = std::string(FilePath);258if (Version < CudaVersion::CUDA_80) {259LibDeviceMap["sm_50"] = std::string(FilePath);260LibDeviceMap["sm_52"] = std::string(FilePath);261LibDeviceMap["sm_53"] = std::string(FilePath);262}263LibDeviceMap["sm_60"] = std::string(FilePath);264LibDeviceMap["sm_61"] = std::string(FilePath);265LibDeviceMap["sm_62"] = std::string(FilePath);266} else if (GpuArch == "compute_35") {267LibDeviceMap["sm_35"] = std::string(FilePath);268LibDeviceMap["sm_37"] = std::string(FilePath);269} else if (GpuArch == "compute_50") {270if (Version >= CudaVersion::CUDA_80) {271LibDeviceMap["sm_50"] = std::string(FilePath);272LibDeviceMap["sm_52"] = std::string(FilePath);273LibDeviceMap["sm_53"] = std::string(FilePath);274}275}276}277}278279// Check that we have found at least one libdevice that we can link in if280// -nocudalib hasn't been specified.281if (LibDeviceMap.empty() && !NoCudaLib)282continue;283284IsValid = true;285break;286}287}288289void CudaInstallationDetector::AddCudaIncludeArgs(290const ArgList &DriverArgs, ArgStringList &CC1Args) const {291if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {292// Add cuda_wrappers/* to our system include path. This lets us wrap293// standard library headers.294SmallString<128> P(D.ResourceDir);295llvm::sys::path::append(P, "include");296llvm::sys::path::append(P, "cuda_wrappers");297CC1Args.push_back("-internal-isystem");298CC1Args.push_back(DriverArgs.MakeArgString(P));299}300301if (DriverArgs.hasArg(options::OPT_nogpuinc))302return;303304if (!isValid()) {305D.Diag(diag::err_drv_no_cuda_installation);306return;307}308309CC1Args.push_back("-include");310CC1Args.push_back("__clang_cuda_runtime_wrapper.h");311}312313void CudaInstallationDetector::CheckCudaVersionSupportsArch(314OffloadArch Arch) const {315if (Arch == OffloadArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||316ArchsWithBadVersion[(int)Arch])317return;318319auto MinVersion = MinVersionForOffloadArch(Arch);320auto MaxVersion = MaxVersionForOffloadArch(Arch);321if (Version < MinVersion || Version > MaxVersion) {322ArchsWithBadVersion[(int)Arch] = true;323D.Diag(diag::err_drv_cuda_version_unsupported)324<< OffloadArchToString(Arch) << CudaVersionToString(MinVersion)325<< CudaVersionToString(MaxVersion) << InstallPath326<< CudaVersionToString(Version);327}328}329330void CudaInstallationDetector::print(raw_ostream &OS) const {331if (isValid())332OS << "Found CUDA installation: " << InstallPath << ", version "333<< CudaVersionToString(Version) << "\n";334}335336namespace {337/// Debug info level for the NVPTX devices. We may need to emit different debug338/// info level for the host and for the device itselfi. This type controls339/// emission of the debug info for the devices. It either prohibits disable info340/// emission completely, or emits debug directives only, or emits same debug341/// info as for the host.342enum DeviceDebugInfoLevel {343DisableDebugInfo, /// Do not emit debug info for the devices.344DebugDirectivesOnly, /// Emit only debug directives.345EmitSameDebugInfoAsHost, /// Use the same debug info level just like for the346/// host.347};348} // anonymous namespace349350/// Define debug info level for the NVPTX devices. If the debug info for both351/// the host and device are disabled (-g0/-ggdb0 or no debug options at all). If352/// only debug directives are requested for the both host and device353/// (-gline-directvies-only), or the debug info only for the device is disabled354/// (optimization is on and --cuda-noopt-device-debug was not specified), the355/// debug directves only must be emitted for the device. Otherwise, use the same356/// debug info level just like for the host (with the limitations of only357/// supported DWARF2 standard).358static DeviceDebugInfoLevel mustEmitDebugInfo(const ArgList &Args) {359const Arg *A = Args.getLastArg(options::OPT_O_Group);360bool IsDebugEnabled = !A || A->getOption().matches(options::OPT_O0) ||361Args.hasFlag(options::OPT_cuda_noopt_device_debug,362options::OPT_no_cuda_noopt_device_debug,363/*Default=*/false);364if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) {365const Option &Opt = A->getOption();366if (Opt.matches(options::OPT_gN_Group)) {367if (Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0))368return DisableDebugInfo;369if (Opt.matches(options::OPT_gline_directives_only))370return DebugDirectivesOnly;371}372return IsDebugEnabled ? EmitSameDebugInfoAsHost : DebugDirectivesOnly;373}374return willEmitRemarks(Args) ? DebugDirectivesOnly : DisableDebugInfo;375}376377void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,378const InputInfo &Output,379const InputInfoList &Inputs,380const ArgList &Args,381const char *LinkingOutput) const {382const auto &TC =383static_cast<const toolchains::NVPTXToolChain &>(getToolChain());384assert(TC.getTriple().isNVPTX() && "Wrong platform");385386StringRef GPUArchName;387// If this is a CUDA action we need to extract the device architecture388// from the Job's associated architecture, otherwise use the -march=arch389// option. This option may come from -Xopenmp-target flag or the default390// value.391if (JA.isDeviceOffloading(Action::OFK_Cuda)) {392GPUArchName = JA.getOffloadingArch();393} else {394GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);395if (GPUArchName.empty()) {396C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)397<< getToolChain().getArchName() << getShortName();398return;399}400}401402// Obtain architecture from the action.403OffloadArch gpu_arch = StringToOffloadArch(GPUArchName);404assert(gpu_arch != OffloadArch::UNKNOWN &&405"Device action expected to have an architecture.");406407// Check that our installation's ptxas supports gpu_arch.408if (!Args.hasArg(options::OPT_no_cuda_version_check)) {409TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch);410}411412ArgStringList CmdArgs;413CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");414DeviceDebugInfoLevel DIKind = mustEmitDebugInfo(Args);415if (DIKind == EmitSameDebugInfoAsHost) {416// ptxas does not accept -g option if optimization is enabled, so417// we ignore the compiler's -O* options if we want debug info.418CmdArgs.push_back("-g");419CmdArgs.push_back("--dont-merge-basicblocks");420CmdArgs.push_back("--return-at-end");421} else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {422// Map the -O we received to -O{0,1,2,3}.423//424// TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's425// default, so it may correspond more closely to the spirit of clang -O2.426427// -O3 seems like the least-bad option when -Osomething is specified to428// clang but it isn't handled below.429StringRef OOpt = "3";430if (A->getOption().matches(options::OPT_O4) ||431A->getOption().matches(options::OPT_Ofast))432OOpt = "3";433else if (A->getOption().matches(options::OPT_O0))434OOpt = "0";435else if (A->getOption().matches(options::OPT_O)) {436// -Os, -Oz, and -O(anything else) map to -O2, for lack of better options.437OOpt = llvm::StringSwitch<const char *>(A->getValue())438.Case("1", "1")439.Case("2", "2")440.Case("3", "3")441.Case("s", "2")442.Case("z", "2")443.Default("2");444}445CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));446} else {447// If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond448// to no optimizations, but ptxas's default is -O3.449CmdArgs.push_back("-O0");450}451if (DIKind == DebugDirectivesOnly)452CmdArgs.push_back("-lineinfo");453454// Pass -v to ptxas if it was passed to the driver.455if (Args.hasArg(options::OPT_v))456CmdArgs.push_back("-v");457458CmdArgs.push_back("--gpu-name");459CmdArgs.push_back(Args.MakeArgString(OffloadArchToString(gpu_arch)));460CmdArgs.push_back("--output-file");461std::string OutputFileName = TC.getInputFilename(Output);462463if (Output.isFilename() && OutputFileName != Output.getFilename())464C.addTempFile(Args.MakeArgString(OutputFileName));465466CmdArgs.push_back(Args.MakeArgString(OutputFileName));467for (const auto &II : Inputs)468CmdArgs.push_back(Args.MakeArgString(II.getFilename()));469470for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))471CmdArgs.push_back(Args.MakeArgString(A));472473bool Relocatable;474if (JA.isOffloading(Action::OFK_OpenMP))475// In OpenMP we need to generate relocatable code.476Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,477options::OPT_fnoopenmp_relocatable_target,478/*Default=*/true);479else if (JA.isOffloading(Action::OFK_Cuda))480// In CUDA we generate relocatable code by default.481Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,482/*Default=*/false);483else484// Otherwise, we are compiling directly and should create linkable output.485Relocatable = true;486487if (Relocatable)488CmdArgs.push_back("-c");489490const char *Exec;491if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ))492Exec = A->getValue();493else494Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));495C.addCommand(std::make_unique<Command>(496JA, *this,497ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,498"--options-file"},499Exec, CmdArgs, Inputs, Output));500}501502static bool shouldIncludePTX(const ArgList &Args, StringRef InputArch) {503// The new driver does not include PTX by default to avoid overhead.504bool includePTX = !Args.hasFlag(options::OPT_offload_new_driver,505options::OPT_no_offload_new_driver, false);506for (Arg *A : Args.filtered(options::OPT_cuda_include_ptx_EQ,507options::OPT_no_cuda_include_ptx_EQ)) {508A->claim();509const StringRef ArchStr = A->getValue();510if (A->getOption().matches(options::OPT_cuda_include_ptx_EQ) &&511(ArchStr == "all" || ArchStr == InputArch))512includePTX = true;513else if (A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ) &&514(ArchStr == "all" || ArchStr == InputArch))515includePTX = false;516}517return includePTX;518}519520// All inputs to this linker must be from CudaDeviceActions, as we need to look521// at the Inputs' Actions in order to figure out which GPU architecture they522// correspond to.523void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,524const InputInfo &Output,525const InputInfoList &Inputs,526const ArgList &Args,527const char *LinkingOutput) const {528const auto &TC =529static_cast<const toolchains::CudaToolChain &>(getToolChain());530assert(TC.getTriple().isNVPTX() && "Wrong platform");531532ArgStringList CmdArgs;533if (TC.CudaInstallation.version() <= CudaVersion::CUDA_100)534CmdArgs.push_back("--cuda");535CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");536CmdArgs.push_back(Args.MakeArgString("--create"));537CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));538if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)539CmdArgs.push_back("-g");540541for (const auto &II : Inputs) {542auto *A = II.getAction();543assert(A->getInputs().size() == 1 &&544"Device offload action is expected to have a single input");545const char *gpu_arch_str = A->getOffloadingArch();546assert(gpu_arch_str &&547"Device action expected to have associated a GPU architecture!");548OffloadArch gpu_arch = StringToOffloadArch(gpu_arch_str);549550if (II.getType() == types::TY_PP_Asm &&551!shouldIncludePTX(Args, gpu_arch_str))552continue;553// We need to pass an Arch of the form "sm_XX" for cubin files and554// "compute_XX" for ptx.555const char *Arch = (II.getType() == types::TY_PP_Asm)556? OffloadArchToVirtualArchString(gpu_arch)557: gpu_arch_str;558CmdArgs.push_back(559Args.MakeArgString(llvm::Twine("--image=profile=") + Arch +560",file=" + getToolChain().getInputFilename(II)));561}562563for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))564CmdArgs.push_back(Args.MakeArgString(A));565566const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));567C.addCommand(std::make_unique<Command>(568JA, *this,569ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,570"--options-file"},571Exec, CmdArgs, Inputs, Output));572}573574void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,575const InputInfo &Output,576const InputInfoList &Inputs,577const ArgList &Args,578const char *LinkingOutput) const {579const auto &TC =580static_cast<const toolchains::NVPTXToolChain &>(getToolChain());581ArgStringList CmdArgs;582583assert(TC.getTriple().isNVPTX() && "Wrong platform");584585assert((Output.isFilename() || Output.isNothing()) && "Invalid output.");586if (Output.isFilename()) {587CmdArgs.push_back("-o");588CmdArgs.push_back(Output.getFilename());589}590591if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)592CmdArgs.push_back("-g");593594if (Args.hasArg(options::OPT_v))595CmdArgs.push_back("-v");596597StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);598if (GPUArch.empty()) {599C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)600<< getToolChain().getArchName() << getShortName();601return;602}603604CmdArgs.push_back("-arch");605CmdArgs.push_back(Args.MakeArgString(GPUArch));606607if (Args.hasArg(options::OPT_ptxas_path_EQ))608CmdArgs.push_back(Args.MakeArgString(609"--pxtas-path=" + Args.getLastArgValue(options::OPT_ptxas_path_EQ)));610611if (Args.hasArg(options::OPT_cuda_path_EQ))612CmdArgs.push_back(Args.MakeArgString(613"--cuda-path=" + Args.getLastArgValue(options::OPT_cuda_path_EQ)));614615// Add paths specified in LIBRARY_PATH environment variable as -L options.616addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");617618// Add standard library search paths passed on the command line.619Args.AddAllArgs(CmdArgs, options::OPT_L);620getToolChain().AddFilePathLibArgs(Args, CmdArgs);621AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);622623if (C.getDriver().isUsingLTO())624addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],625C.getDriver().getLTOMode() == LTOK_Thin);626627// Add paths for the default clang library path.628SmallString<256> DefaultLibPath =629llvm::sys::path::parent_path(TC.getDriver().Dir);630llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);631CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));632633C.addCommand(std::make_unique<Command>(634JA, *this,635ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,636"--options-file"},637Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper")),638CmdArgs, Inputs, Output));639}640641void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,642const llvm::opt::ArgList &Args,643std::vector<StringRef> &Features) {644if (Args.hasArg(options::OPT_cuda_feature_EQ)) {645StringRef PtxFeature =646Args.getLastArgValue(options::OPT_cuda_feature_EQ, "+ptx42");647Features.push_back(Args.MakeArgString(PtxFeature));648return;649}650CudaInstallationDetector CudaInstallation(D, Triple, Args);651652// New CUDA versions often introduce new instructions that are only supported653// by new PTX version, so we need to raise PTX level to enable them in NVPTX654// back-end.655const char *PtxFeature = nullptr;656switch (CudaInstallation.version()) {657#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \658case CudaVersion::CUDA_##CUDA_VER: \659PtxFeature = "+ptx" #PTX_VER; \660break;661CASE_CUDA_VERSION(125, 85);662CASE_CUDA_VERSION(124, 84);663CASE_CUDA_VERSION(123, 83);664CASE_CUDA_VERSION(122, 82);665CASE_CUDA_VERSION(121, 81);666CASE_CUDA_VERSION(120, 80);667CASE_CUDA_VERSION(118, 78);668CASE_CUDA_VERSION(117, 77);669CASE_CUDA_VERSION(116, 76);670CASE_CUDA_VERSION(115, 75);671CASE_CUDA_VERSION(114, 74);672CASE_CUDA_VERSION(113, 73);673CASE_CUDA_VERSION(112, 72);674CASE_CUDA_VERSION(111, 71);675CASE_CUDA_VERSION(110, 70);676CASE_CUDA_VERSION(102, 65);677CASE_CUDA_VERSION(101, 64);678CASE_CUDA_VERSION(100, 63);679CASE_CUDA_VERSION(92, 61);680CASE_CUDA_VERSION(91, 61);681CASE_CUDA_VERSION(90, 60);682#undef CASE_CUDA_VERSION683default:684PtxFeature = "+ptx42";685}686Features.push_back(PtxFeature);687}688689/// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This690/// operates as a stand-alone version of the NVPTX tools without the host691/// toolchain.692NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,693const llvm::Triple &HostTriple,694const ArgList &Args, bool Freestanding = false)695: ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args),696Freestanding(Freestanding) {697if (CudaInstallation.isValid())698getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));699// Lookup binaries into the driver directory, this is used to700// discover the 'nvptx-arch' executable.701getProgramPaths().push_back(getDriver().Dir);702}703704/// We only need the host triple to locate the CUDA binary utilities, use the705/// system's default triple if not provided.706NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,707const ArgList &Args)708: NVPTXToolChain(D, Triple, llvm::Triple(LLVM_HOST_TRIPLE), Args,709/*Freestanding=*/true) {}710711llvm::opt::DerivedArgList *712NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,713StringRef BoundArch,714Action::OffloadKind OffloadKind) const {715DerivedArgList *DAL = ToolChain::TranslateArgs(Args, BoundArch, OffloadKind);716if (!DAL)717DAL = new DerivedArgList(Args.getBaseArgs());718719const OptTable &Opts = getDriver().getOpts();720721for (Arg *A : Args)722if (!llvm::is_contained(*DAL, A))723DAL->append(A);724725if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) {726DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),727OffloadArchToString(OffloadArch::CudaDefault));728} else if (DAL->getLastArgValue(options::OPT_march_EQ) == "generic" &&729OffloadKind == Action::OFK_None) {730DAL->eraseArg(options::OPT_march_EQ);731} else if (DAL->getLastArgValue(options::OPT_march_EQ) == "native") {732auto GPUsOrErr = getSystemGPUArchs(Args);733if (!GPUsOrErr) {734getDriver().Diag(diag::err_drv_undetermined_gpu_arch)735<< getArchName() << llvm::toString(GPUsOrErr.takeError()) << "-march";736} else {737if (GPUsOrErr->size() > 1)738getDriver().Diag(diag::warn_drv_multi_gpu_arch)739<< getArchName() << llvm::join(*GPUsOrErr, ", ") << "-march";740DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),741Args.MakeArgString(GPUsOrErr->front()));742}743}744745return DAL;746}747748void NVPTXToolChain::addClangTargetOptions(749const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,750Action::OffloadKind DeviceOffloadingKind) const {751// If we are compiling with a standalone NVPTX toolchain we want to try to752// mimic a standard environment as much as possible. So we enable lowering753// ctor / dtor functions to global symbols that can be registered.754if (Freestanding)755CC1Args.append({"-mllvm", "--nvptx-lower-global-ctor-dtor"});756}757758bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {759const Option &O = A->getOption();760return (O.matches(options::OPT_gN_Group) &&761!O.matches(options::OPT_gmodules)) ||762O.matches(options::OPT_g_Flag) ||763O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||764O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||765O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||766O.matches(options::OPT_gdwarf_5) ||767O.matches(options::OPT_gcolumn_info);768}769770void NVPTXToolChain::adjustDebugInfoKind(771llvm::codegenoptions::DebugInfoKind &DebugInfoKind,772const ArgList &Args) const {773switch (mustEmitDebugInfo(Args)) {774case DisableDebugInfo:775DebugInfoKind = llvm::codegenoptions::NoDebugInfo;776break;777case DebugDirectivesOnly:778DebugInfoKind = llvm::codegenoptions::DebugDirectivesOnly;779break;780case EmitSameDebugInfoAsHost:781// Use same debug info level as the host.782break;783}784}785786Expected<SmallVector<std::string>>787NVPTXToolChain::getSystemGPUArchs(const ArgList &Args) const {788// Detect NVIDIA GPUs availible on the system.789std::string Program;790if (Arg *A = Args.getLastArg(options::OPT_nvptx_arch_tool_EQ))791Program = A->getValue();792else793Program = GetProgramPath("nvptx-arch");794795auto StdoutOrErr = executeToolChainProgram(Program, /*SecondsToWait=*/10);796if (!StdoutOrErr)797return StdoutOrErr.takeError();798799SmallVector<std::string, 1> GPUArchs;800for (StringRef Arch : llvm::split((*StdoutOrErr)->getBuffer(), "\n"))801if (!Arch.empty())802GPUArchs.push_back(Arch.str());803804if (GPUArchs.empty())805return llvm::createStringError(std::error_code(),806"No NVIDIA GPU detected in the system");807808return std::move(GPUArchs);809}810811/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,812/// which isn't properly a linker but nonetheless performs the step of stitching813/// together object files from the assembler into a single blob.814815CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,816const ToolChain &HostTC, const ArgList &Args)817: NVPTXToolChain(D, Triple, HostTC.getTriple(), Args), HostTC(HostTC) {}818819void CudaToolChain::addClangTargetOptions(820const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,821Action::OffloadKind DeviceOffloadingKind) const {822HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);823824StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);825assert(!GpuArch.empty() && "Must have an explicit GPU arch.");826assert((DeviceOffloadingKind == Action::OFK_OpenMP ||827DeviceOffloadingKind == Action::OFK_Cuda) &&828"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");829830if (DeviceOffloadingKind == Action::OFK_Cuda) {831CC1Args.append(832{"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});833834// Unsized function arguments used for variadics were introduced in CUDA-9.0835// We still do not support generating code that actually uses variadic836// arguments yet, but we do need to allow parsing them as recent CUDA837// headers rely on that. https://github.com/llvm/llvm-project/issues/58410838if (CudaInstallation.version() >= CudaVersion::CUDA_90)839CC1Args.push_back("-fcuda-allow-variadic-functions");840}841842if (DriverArgs.hasArg(options::OPT_nogpulib))843return;844845if (DeviceOffloadingKind == Action::OFK_OpenMP &&846DriverArgs.hasArg(options::OPT_S))847return;848849std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);850if (LibDeviceFile.empty()) {851getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;852return;853}854855CC1Args.push_back("-mlink-builtin-bitcode");856CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));857858clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();859860if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,861options::OPT_fno_cuda_short_ptr, false))862CC1Args.append({"-mllvm", "--nvptx-short-ptr"});863864if (CudaInstallationVersion >= CudaVersion::UNKNOWN)865CC1Args.push_back(866DriverArgs.MakeArgString(Twine("-target-sdk-version=") +867CudaVersionToString(CudaInstallationVersion)));868869if (DeviceOffloadingKind == Action::OFK_OpenMP) {870if (CudaInstallationVersion < CudaVersion::CUDA_92) {871getDriver().Diag(872diag::err_drv_omp_offload_target_cuda_version_not_support)873<< CudaVersionToString(CudaInstallationVersion);874return;875}876877// Link the bitcode library late if we're using device LTO.878if (getDriver().isUsingLTO(/* IsOffload */ true))879return;880881addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),882getTriple(), HostTC);883}884}885886llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(887const llvm::opt::ArgList &DriverArgs, const JobAction &JA,888const llvm::fltSemantics *FPType) const {889if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) {890if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&891DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero,892options::OPT_fno_gpu_flush_denormals_to_zero, false))893return llvm::DenormalMode::getPreserveSign();894}895896assert(JA.getOffloadingDeviceKind() != Action::OFK_Host);897return llvm::DenormalMode::getIEEE();898}899900void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,901ArgStringList &CC1Args) const {902// Check our CUDA version if we're going to include the CUDA headers.903if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&904!DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {905StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);906assert(!Arch.empty() && "Must have an explicit GPU arch.");907CudaInstallation.CheckCudaVersionSupportsArch(StringToOffloadArch(Arch));908}909CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);910}911912std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {913// Only object files are changed, for example assembly files keep their .s914// extensions. If the user requested device-only compilation don't change it.915if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())916return ToolChain::getInputFilename(Input);917918return ToolChain::getInputFilename(Input);919}920921llvm::opt::DerivedArgList *922CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,923StringRef BoundArch,924Action::OffloadKind DeviceOffloadKind) const {925DerivedArgList *DAL =926HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);927if (!DAL)928DAL = new DerivedArgList(Args.getBaseArgs());929930const OptTable &Opts = getDriver().getOpts();931932// For OpenMP device offloading, append derived arguments. Make sure933// flags are not duplicated.934// Also append the compute capability.935if (DeviceOffloadKind == Action::OFK_OpenMP) {936for (Arg *A : Args)937if (!llvm::is_contained(*DAL, A))938DAL->append(A);939940if (!DAL->hasArg(options::OPT_march_EQ)) {941StringRef Arch = BoundArch;942if (Arch.empty()) {943auto ArchsOrErr = getSystemGPUArchs(Args);944if (!ArchsOrErr) {945std::string ErrMsg =946llvm::formatv("{0}", llvm::fmt_consume(ArchsOrErr.takeError()));947getDriver().Diag(diag::err_drv_undetermined_gpu_arch)948<< llvm::Triple::getArchTypeName(getArch()) << ErrMsg << "-march";949Arch = OffloadArchToString(OffloadArch::CudaDefault);950} else {951Arch = Args.MakeArgString(ArchsOrErr->front());952}953}954DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch);955}956957return DAL;958}959960for (Arg *A : Args) {961// Make sure flags are not duplicated.962if (!llvm::is_contained(*DAL, A)) {963DAL->append(A);964}965}966967if (!BoundArch.empty()) {968DAL->eraseArg(options::OPT_march_EQ);969DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),970BoundArch);971}972return DAL;973}974975Tool *NVPTXToolChain::buildAssembler() const {976return new tools::NVPTX::Assembler(*this);977}978979Tool *NVPTXToolChain::buildLinker() const {980return new tools::NVPTX::Linker(*this);981}982983Tool *CudaToolChain::buildAssembler() const {984return new tools::NVPTX::Assembler(*this);985}986987Tool *CudaToolChain::buildLinker() const {988return new tools::NVPTX::FatBinary(*this);989}990991void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {992HostTC.addClangWarningOptions(CC1Args);993}994995ToolChain::CXXStdlibType996CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {997return HostTC.GetCXXStdlibType(Args);998}9991000void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,1001ArgStringList &CC1Args) const {1002HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);10031004if (!DriverArgs.hasArg(options::OPT_nogpuinc) && CudaInstallation.isValid())1005CC1Args.append(1006{"-internal-isystem",1007DriverArgs.MakeArgString(CudaInstallation.getIncludePath())});1008}10091010void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,1011ArgStringList &CC1Args) const {1012HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);1013}10141015void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,1016ArgStringList &CC1Args) const {1017HostTC.AddIAMCUIncludeArgs(Args, CC1Args);1018}10191020SanitizerMask CudaToolChain::getSupportedSanitizers() const {1021// The CudaToolChain only supports sanitizers in the sense that it allows1022// sanitizer arguments on the command line if they are supported by the host1023// toolchain. The CudaToolChain will actually ignore any command line1024// arguments for any of these "supported" sanitizers. That means that no1025// sanitization of device code is actually supported at this time.1026//1027// This behavior is necessary because the host and device toolchains1028// invocations often share the command line, so the device toolchain must1029// tolerate flags meant only for the host toolchain.1030return HostTC.getSupportedSanitizers();1031}10321033VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D,1034const ArgList &Args) const {1035return HostTC.computeMSVCVersion(D, Args);1036}103710381039