Skip to content

Commit c17c2e7

Browse files
yxsamliuChengChen002
authored and committed
[HIP] Allow partial linking for -fgpu-rdc (#81700)
`-fgpu-rdc` mode allows device functions to call device functions in different TUs. However, currently all device objects have to be linked together since only one fat binary is supported. This is time-consuming for the AMDGPU backend since it only supports LTO. There are use cases where objects can be divided into groups in which device functions are self-contained but host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, while partially linking the host code with `ld -r` or generating a static library by using the `--emit-static-lib` option of clang. This avoids linking all device code together and therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emitted an external symbol `__hip_fatbin` for all objects for `-fgpu-rdc`. With this patch, clang emits a unique external symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a group of objects is linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fat binary symbols only, to avoid repeated linking. `__hip_gpubin_handle` is also uniquified and merged to avoid repeated registration. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. Fixes: llvm/llvm-project#77018
1 parent 81b8f19 commit c17c2e7

11 files changed

+460
-56
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

+11-11
Original file line numberDiff line numberDiff line change
@@ -762,10 +762,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
762762
// to contain the fat binary but will be populated somewhere else,
763763
// e.g. by lld through link script.
764764
FatBinStr = new llvm::GlobalVariable(
765-
CGM.getModule(), CGM.Int8Ty,
766-
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
767-
"__hip_fatbin", nullptr,
768-
llvm::GlobalVariable::NotThreadLocal);
765+
CGM.getModule(), CGM.Int8Ty,
766+
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
767+
"__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
768+
llvm::GlobalVariable::NotThreadLocal);
769769
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
770770
}
771771

@@ -818,8 +818,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
818818
// thread safety of the loaded program. Therefore we can assume sequential
819819
// execution of constructor functions here.
820820
if (IsHIP) {
821-
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
822-
llvm::GlobalValue::LinkOnceAnyLinkage;
821+
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
822+
: llvm::GlobalValue::ExternalLinkage;
823823
llvm::BasicBlock *IfBlock =
824824
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
825825
llvm::BasicBlock *ExitBlock =
@@ -828,11 +828,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
828828
// of HIP ABI.
829829
GpuBinaryHandle = new llvm::GlobalVariable(
830830
TheModule, PtrTy, /*isConstant=*/false, Linkage,
831-
/*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
832-
"__hip_gpubin_handle");
833-
if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
834-
GpuBinaryHandle->setComdat(
835-
CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
831+
/*Initializer=*/
832+
CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
833+
CudaGpuBinary
834+
? "__hip_gpubin_handle"
835+
: "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
836836
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
837837
// Prevent the weak symbol in different shared libraries being merged.
838838
if (Linkage != llvm::GlobalValue::InternalLinkage)

clang/lib/CodeGen/CodeGenModule.cpp

+9-1
Original file line numberDiff line numberDiff line change
@@ -959,7 +959,15 @@ void CodeGenModule::Release() {
959959
llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external");
960960
addCompilerUsedGlobal(GV);
961961
}
962-
962+
if (LangOpts.HIP) {
963+
// Emit a unique ID so that host and device binaries from the same
964+
// compilation unit can be associated.
965+
auto *GV = new llvm::GlobalVariable(
966+
getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage,
967+
llvm::Constant::getNullValue(Int8Ty),
968+
"__hip_cuid_" + getContext().getCUIDHash());
969+
addCompilerUsedGlobal(GV);
970+
}
963971
emitLLVMUsed();
964972
if (SanStats)
965973
SanStats->finish();

clang/lib/Driver/OffloadBundler.cpp

+29-8
Original file line numberDiff line numberDiff line change
@@ -733,10 +733,6 @@ class ObjectFileHandler final : public FileHandler {
733733
StringRef Content = *ContentOrErr;
734734

735735
// Copy fat object contents to the output when extracting host bundle.
736-
<<<<<<< HEAD
737-
if (Content.size() == 1u && Content.front() == 0)
738-
Content = StringRef(Input.getBufferStart(), Input.getBufferSize());
739-
=======
740736
std::string ModifiedContent;
741737
if (Content.size() == 1u && Content.front() == 0) {
742738
auto HostBundleOrErr = getHostBundle(
@@ -747,7 +743,6 @@ class ObjectFileHandler final : public FileHandler {
747743
ModifiedContent = std::move(*HostBundleOrErr);
748744
Content = ModifiedContent;
749745
}
750-
>>>>>>> 61b13e0dfe1b476d9bf0fe477983be8471cfd26b
751746

752747
OS.write(Content.data(), Content.size());
753748
return Error::success();
@@ -869,8 +864,6 @@ class ObjectFileHandler final : public FileHandler {
869864
}
870865
return Error::success();
871866
}
872-
<<<<<<< HEAD
873-
=======
874867

875868
Expected<std::string> getHostBundle(StringRef Input) {
876869
TempFileHandlerRAII TempFiles;
@@ -917,7 +910,35 @@ class ObjectFileHandler final : public FileHandler {
917910

918911
return BufOrErr->get()->getBuffer().str();
919912
}
920-
>>>>>>> 61b13e0dfe1b476d9bf0fe477983be8471cfd26b
913+
914+
Expected<std::string> getHostBundle() {
915+
TempFileHandlerRAII TempFiles;
916+
917+
auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt);
918+
if (!ModifiedObjPathOrErr)
919+
return ModifiedObjPathOrErr.takeError();
920+
StringRef ModifiedObjPath = *ModifiedObjPathOrErr;
921+
922+
BumpPtrAllocator Alloc;
923+
StringSaver SS{Alloc};
924+
SmallVector<StringRef, 16> ObjcopyArgs{"llvm-objcopy"};
925+
926+
ObjcopyArgs.push_back("--regex");
927+
ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*");
928+
ObjcopyArgs.push_back("--");
929+
ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front());
930+
ObjcopyArgs.push_back(ModifiedObjPath);
931+
932+
if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs))
933+
return std::move(Err);
934+
935+
auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath);
936+
if (!BufOrErr)
937+
return createStringError(BufOrErr.getError(),
938+
"Failed to read back the modified object file");
939+
940+
return BufOrErr->get()->getBuffer().str();
941+
}
921942
};
922943

923944
/// Handler for text files. The bundled file will have the following format.

0 commit comments

Comments
 (0)