Skip to content

Commit 7155c1e

Browse files
authored
[NVPTX] Allow compiling LLVM-IR without -march set (#79873)
Summary: The NVPTX tools require an architecture to be specified; however, if we are creating generic LLVM-IR we should be able to leave it unspecified. This will result in the `target-cpu` attributes not being set on the functions, so the target architecture can be chosen later when the IR is linked into other code. This allows the standalone `--target=nvptx64-nvidia-cuda` toolchain to create LLVM-IR similar to how CUDA's deviceRTL looks from C/C++.
1 parent c19436e commit 7155c1e

File tree

6 files changed

+51
-19
lines changed

6 files changed

+51
-19
lines changed

clang/include/clang/Basic/DiagnosticDriverKinds.td

+2
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ def warn_drv_avr_stdlib_not_linked: Warning<
5757
InGroup<AVRRtlibLinkingQuirks>;
5858
def err_drv_cuda_bad_gpu_arch : Error<"unsupported CUDA gpu architecture: %0">;
5959
def err_drv_offload_bad_gpu_arch : Error<"unsupported %0 gpu architecture: %1">;
60+
def err_drv_offload_missing_gpu_arch : Error<
61+
"Must pass in an explicit %0 gpu architecture to '%1'">;
6062
def err_drv_no_cuda_installation : Error<
6163
"cannot find CUDA installation; provide its path via '--cuda-path', or pass "
6264
"'-nocudainc' to build without CUDA includes">;

clang/lib/Basic/Targets/NVPTX.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
5959
// Define available target features
6060
// These must be defined in sorted order!
6161
NoAsmVariants = true;
62-
GPU = CudaArch::SM_20;
62+
GPU = CudaArch::UNUSED;
6363

6464
if (TargetPointerWidth == 32)
6565
resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
@@ -169,6 +169,11 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
169169
MacroBuilder &Builder) const {
170170
Builder.defineMacro("__PTX__");
171171
Builder.defineMacro("__NVPTX__");
172+
173+
// Skip setting architecture dependent macros if undefined.
174+
if (GPU == CudaArch::UNUSED && !HostTarget)
175+
return;
176+
172177
if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
173178
// Set __CUDA_ARCH__ for the GPU specified.
174179
std::string CUDAArchCode = [this] {
@@ -220,10 +225,10 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
220225
case CudaArch::Generic:
221226
case CudaArch::LAST:
222227
break;
223-
case CudaArch::UNUSED:
224228
case CudaArch::UNKNOWN:
225229
assert(false && "No GPU arch when compiling CUDA device code.");
226230
return "";
231+
case CudaArch::UNUSED:
227232
case CudaArch::SM_20:
228233
return "200";
229234
case CudaArch::SM_21:

clang/lib/Basic/Targets/NVPTX.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
7979
initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
8080
StringRef CPU,
8181
const std::vector<std::string> &FeaturesVec) const override {
82-
Features[CudaArchToString(GPU)] = true;
82+
if (GPU != CudaArch::UNUSED)
83+
Features[CudaArchToString(GPU)] = true;
8384
Features["ptx" + std::to_string(PTXVersion)] = true;
8485
return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
8586
}

clang/lib/Driver/ToolChains/Cuda.cpp

+14-6
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,11 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
389389
GPUArchName = JA.getOffloadingArch();
390390
} else {
391391
GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
392-
assert(!GPUArchName.empty() && "Must have an architecture passed in.");
392+
if (GPUArchName.empty()) {
393+
C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
394+
<< getToolChain().getArchName() << getShortName();
395+
return;
396+
}
393397
}
394398

395399
// Obtain architecture from the action.
@@ -593,7 +597,11 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
593597
CmdArgs.push_back("-v");
594598

595599
StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
596-
assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink.");
600+
if (GPUArch.empty()) {
601+
C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
602+
<< getToolChain().getArchName() << getShortName();
603+
return;
604+
}
597605

598606
CmdArgs.push_back("-arch");
599607
CmdArgs.push_back(Args.MakeArgString(GPUArch));
@@ -726,9 +734,8 @@ NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
726734
llvm::opt::DerivedArgList *
727735
NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
728736
StringRef BoundArch,
729-
Action::OffloadKind DeviceOffloadKind) const {
730-
DerivedArgList *DAL =
731-
ToolChain::TranslateArgs(Args, BoundArch, DeviceOffloadKind);
737+
Action::OffloadKind OffloadKind) const {
738+
DerivedArgList *DAL = ToolChain::TranslateArgs(Args, BoundArch, OffloadKind);
732739
if (!DAL)
733740
DAL = new DerivedArgList(Args.getBaseArgs());
734741

@@ -738,7 +745,8 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
738745
if (!llvm::is_contained(*DAL, A))
739746
DAL->append(A);
740747

741-
if (!DAL->hasArg(options::OPT_march_EQ)) {
748+
// TODO: We should accept 'generic' as a valid architecture.
749+
if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) {
742750
DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
743751
CudaArchToString(CudaArch::CudaDefault));
744752
} else if (DAL->getLastArgValue(options::OPT_march_EQ) == "native") {

clang/test/Driver/cuda-cross-compiling.c

+14-10
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,6 @@
5959

6060
// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"
6161

62-
//
63-
// Test the generated arguments default to a value with no architecture.
64-
//
65-
// RUN: %clang --target=nvptx64-nvidia-cuda -### --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
66-
// RUN: | FileCheck -check-prefix=DEFAULT %s
67-
68-
// DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_52" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
69-
// DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_52" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
70-
// DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_52" {{.*}} "[[CUBIN]].cubin"
71-
7262
//
7363
// Test to ensure that we enable handling global constructors in a freestanding
7464
// Nvidia compilation.
@@ -77,3 +67,17 @@
7767
// RUN: | FileCheck -check-prefix=LOWERING %s
7868

7969
// LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor"
70+
71+
//
72+
// Tests for handling a missing architecture.
73+
//
74+
// RUN: not %clang -target nvptx64-nvidia-cuda %s -### 2>&1 \
75+
// RUN: | FileCheck -check-prefix=MISSING %s
76+
77+
// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'ptxas'
78+
// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'nvlink'
79+
80+
// RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
81+
// RUN: | FileCheck -check-prefix=GENERIC %s
82+
83+
// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu"

clang/test/Preprocessor/predefined-arch-macros.c

+12
Original file line numberDiff line numberDiff line change
@@ -4292,6 +4292,18 @@
42924292
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
42934293
// CHECK_SYSTEMZ_ZVECTOR: #define __VEC__ 10304
42944294

4295+
// Begin nvptx tests ----------------
4296+
4297+
// RUN: %clang -march=sm_75 -E -dM %s -o - 2>&1 \
4298+
// RUN: -target nvptx64-unknown-unknown \
4299+
// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_NVPTX,CHECK_ARCH_SM_75
4300+
// RUN: %clang -E -dM %s -o - 2>&1 \
4301+
// RUN: -target nvptx64-unknown-unknown \
4302+
// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_NVPTX,CHECK_ARCH_UNSET
4303+
// CHECK_ARCH_SM_75: #define __CUDA_ARCH__ 750
4304+
// CHECK_ARCH_UNSET-NOT: #define __CUDA_ARCH__
4305+
// CHECK_NVPTX: #define __NVPTX__ 1
4306+
42954307
// Begin amdgcn tests ----------------
42964308

42974309
// RUN: %clang -mcpu=gfx803 -E -dM %s -o - 2>&1 \

0 commit comments

Comments
 (0)