Skip to content

Commit 33a6ce1

Browse files
authored
[HIP] Allow partial linking for -fgpu-rdc (#81700)
`-fgpu-rdc` mode allows device functions to call device functions in different TUs. However, currently all device objects have to be linked together since only one fat binary is supported. This is time-consuming for the AMDGPU backend since it only supports LTO. There are use cases where objects can be divided into groups in which device functions are self-contained but host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, while partially linking the host code with `ld -r` or generating a static library by using the `--emit-static-lib` option of clang. This avoids linking all device code together and therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emitted an external symbol `__hip_fatbin` in all objects compiled with `-fgpu-rdc`. With this patch, clang emits a unique external symbol `__hip_fatbin_{cuid}` for the fat binary of each object. When a group of objects is linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fat binary symbols only, to avoid repeated linking. `__hip_gpubin_handle` is also uniquified and merged to avoid repeated registration. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. Fixes: #77018
1 parent cc83927 commit 33a6ce1

11 files changed

+469
-50
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

+11-11
Original file line numberDiff line numberDiff line change
@@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
760760
// to contain the fat binary but will be populated somewhere else,
761761
// e.g. by lld through link script.
762762
FatBinStr = new llvm::GlobalVariable(
763-
CGM.getModule(), CGM.Int8Ty,
764-
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
765-
"__hip_fatbin", nullptr,
766-
llvm::GlobalVariable::NotThreadLocal);
763+
CGM.getModule(), CGM.Int8Ty,
764+
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
765+
"__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
766+
llvm::GlobalVariable::NotThreadLocal);
767767
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
768768
}
769769

@@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
816816
// thread safety of the loaded program. Therefore we can assume sequential
817817
// execution of constructor functions here.
818818
if (IsHIP) {
819-
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
820-
llvm::GlobalValue::LinkOnceAnyLinkage;
819+
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
820+
: llvm::GlobalValue::ExternalLinkage;
821821
llvm::BasicBlock *IfBlock =
822822
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
823823
llvm::BasicBlock *ExitBlock =
@@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
826826
// of HIP ABI.
827827
GpuBinaryHandle = new llvm::GlobalVariable(
828828
TheModule, PtrTy, /*isConstant=*/false, Linkage,
829-
/*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
830-
"__hip_gpubin_handle");
831-
if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
832-
GpuBinaryHandle->setComdat(
833-
CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
829+
/*Initializer=*/
830+
CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
831+
CudaGpuBinary
832+
? "__hip_gpubin_handle"
833+
: "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
834834
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
835835
// Prevent the weak symbol in different shared libraries being merged.
836836
if (Linkage != llvm::GlobalValue::InternalLinkage)

clang/lib/CodeGen/CodeGenModule.cpp

+9-1
Original file line numberDiff line numberDiff line change
@@ -915,7 +915,15 @@ void CodeGenModule::Release() {
915915
llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external");
916916
addCompilerUsedGlobal(GV);
917917
}
918-
918+
if (LangOpts.HIP) {
919+
// Emit a unique ID so that host and device binaries from the same
920+
// compilation unit can be associated.
921+
auto *GV = new llvm::GlobalVariable(
922+
getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage,
923+
llvm::Constant::getNullValue(Int8Ty),
924+
"__hip_cuid_" + getContext().getCUIDHash());
925+
addCompilerUsedGlobal(GV);
926+
}
919927
emitLLVMUsed();
920928
if (SanStats)
921929
SanStats->finish();

clang/lib/Driver/OffloadBundler.cpp

+38-2
Original file line numberDiff line numberDiff line change
@@ -588,8 +588,15 @@ class ObjectFileHandler final : public FileHandler {
588588
StringRef Content = *ContentOrErr;
589589

590590
// Copy fat object contents to the output when extracting host bundle.
591-
if (Content.size() == 1u && Content.front() == 0)
592-
Content = StringRef(Input.getBufferStart(), Input.getBufferSize());
591+
std::string ModifiedContent;
592+
if (Content.size() == 1u && Content.front() == 0) {
593+
auto HostBundleOrErr = getHostBundle();
594+
if (!HostBundleOrErr)
595+
return HostBundleOrErr.takeError();
596+
597+
ModifiedContent = std::move(*HostBundleOrErr);
598+
Content = ModifiedContent;
599+
}
593600

594601
OS.write(Content.data(), Content.size());
595602
return Error::success();
@@ -692,6 +699,35 @@ class ObjectFileHandler final : public FileHandler {
692699
}
693700
return Error::success();
694701
}
702+
703+
/// Produce the host bundle by running llvm-objcopy on the first input file
/// with every section matching `__CLANG_OFFLOAD_BUNDLE__.*` removed, and
/// return the bytes of the resulting object.
///
/// \returns the contents of the stripped object file as a std::string, or an
/// Error if the temporary file cannot be created, llvm-objcopy fails, or the
/// stripped file cannot be read back.
Expected<std::string> getHostBundle() {
704+
TempFileHandlerRAII TempFiles;
705+
706+
// Temporary path that receives the stripped copy of the input object.
auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt);
707+
if (!ModifiedObjPathOrErr)
708+
return ModifiedObjPathOrErr.takeError();
709+
StringRef ModifiedObjPath = *ModifiedObjPathOrErr;
710+
711+
// NOTE(review): Alloc and SS are not referenced again in the visible body.
BumpPtrAllocator Alloc;
712+
StringSaver SS{Alloc};
713+
SmallVector<StringRef, 16> ObjcopyArgs{"llvm-objcopy"};
714+
715+
// --regex makes the --remove-section pattern a regular expression, so all
// sections whose names start with __CLANG_OFFLOAD_BUNDLE__ are dropped.
ObjcopyArgs.push_back("--regex");
716+
ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*");
717+
ObjcopyArgs.push_back("--");
718+
// Input is the first configured input file; output is the temporary path.
ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front());
719+
ObjcopyArgs.push_back(ModifiedObjPath);
720+
721+
if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs))
722+
return std::move(Err);
723+
724+
// Read the stripped object back into memory so it can be returned by value.
auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath);
725+
if (!BufOrErr)
726+
return createStringError(BufOrErr.getError(),
727+
"Failed to read back the modified object file");
728+
729+
return BufOrErr->get()->getBuffer().str();
730+
}
695731
};
696732

697733
/// Handler for text files. The bundled file will have the following format.

0 commit comments

Comments
 (0)