Skip to content

Commit a1a2fa5

Browse files
yxsamliu, searlmc1
authored and committed
[HIP] Allow partial linking for -fgpu-rdc (llvm#81700)
`-fgpu-rdc` mode allows device functions to call device functions in a different TU. However, currently all device objects have to be linked together, since only one fat binary is supported. This is time-consuming for the AMDGPU backend, since it only supports LTO. There are use cases in which objects can be divided into groups whose device functions are self-contained but whose host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, while partially linking the host code with `ld -r` or generating a static library by using the `--emit-static-lib` option of clang. This avoids linking all device code together and therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emitted an external symbol `__hip_fatbin` for all objects for `-fgpu-rdc`. With this patch, clang emits a unique external symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a group of objects is linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fat binary symbols only, to avoid repeated linking. `__hip_gpubin_handle` is also uniquified and merged to avoid repeated registering. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. Fixes: llvm#77018 Change-Id: I0ebf263b742b554939e5b758e5ec761e00763738
1 parent 9203121 commit a1a2fa5

11 files changed

+469
-50
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

+11-11
Original file line numberDiff line numberDiff line change
@@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
760760
// to contain the fat binary but will be populated somewhere else,
761761
// e.g. by lld through link script.
762762
FatBinStr = new llvm::GlobalVariable(
763-
CGM.getModule(), CGM.Int8Ty,
764-
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
765-
"__hip_fatbin", nullptr,
766-
llvm::GlobalVariable::NotThreadLocal);
763+
CGM.getModule(), CGM.Int8Ty,
764+
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
765+
"__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
766+
llvm::GlobalVariable::NotThreadLocal);
767767
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
768768
}
769769

@@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
816816
// thread safety of the loaded program. Therefore we can assume sequential
817817
// execution of constructor functions here.
818818
if (IsHIP) {
819-
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
820-
llvm::GlobalValue::LinkOnceAnyLinkage;
819+
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
820+
: llvm::GlobalValue::ExternalLinkage;
821821
llvm::BasicBlock *IfBlock =
822822
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
823823
llvm::BasicBlock *ExitBlock =
@@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
826826
// of HIP ABI.
827827
GpuBinaryHandle = new llvm::GlobalVariable(
828828
TheModule, PtrTy, /*isConstant=*/false, Linkage,
829-
/*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
830-
"__hip_gpubin_handle");
831-
if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
832-
GpuBinaryHandle->setComdat(
833-
CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
829+
/*Initializer=*/
830+
CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
831+
CudaGpuBinary
832+
? "__hip_gpubin_handle"
833+
: "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
834834
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
835835
// Prevent the weak symbol in different shared libraries being merged.
836836
if (Linkage != llvm::GlobalValue::InternalLinkage)

clang/lib/CodeGen/CodeGenModule.cpp

+9-1
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,15 @@ void CodeGenModule::Release() {
894894
llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external");
895895
addCompilerUsedGlobal(GV);
896896
}
897-
897+
if (LangOpts.HIP) {
898+
// Emit a unique ID so that host and device binaries from the same
899+
// compilation unit can be associated.
900+
auto *GV = new llvm::GlobalVariable(
901+
getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage,
902+
llvm::Constant::getNullValue(Int8Ty),
903+
"__hip_cuid_" + getContext().getCUIDHash());
904+
addCompilerUsedGlobal(GV);
905+
}
898906
emitLLVMUsed();
899907
if (SanStats)
900908
SanStats->finish();

clang/lib/Driver/OffloadBundler.cpp

+38-2
Original file line numberDiff line numberDiff line change
@@ -588,8 +588,15 @@ class ObjectFileHandler final : public FileHandler {
588588
StringRef Content = *ContentOrErr;
589589

590590
// Copy fat object contents to the output when extracting host bundle.
591-
if (Content.size() == 1u && Content.front() == 0)
592-
Content = StringRef(Input.getBufferStart(), Input.getBufferSize());
591+
std::string ModifiedContent;
592+
if (Content.size() == 1u && Content.front() == 0) {
593+
auto HostBundleOrErr = getHostBundle();
594+
if (!HostBundleOrErr)
595+
return HostBundleOrErr.takeError();
596+
597+
ModifiedContent = std::move(*HostBundleOrErr);
598+
Content = ModifiedContent;
599+
}
593600

594601
OS.write(Content.data(), Content.size());
595602
return Error::success();
@@ -692,6 +699,35 @@ class ObjectFileHandler final : public FileHandler {
692699
}
693700
return Error::success();
694701
}
702+
703+
/// Produce the host bundle by copying the first input file through
/// llvm-objcopy with every section matching `__CLANG_OFFLOAD_BUNDLE__.*`
/// removed, then reading the result back into memory.
///
/// \returns the bytes of the stripped object as a std::string, or an Error if
/// the temporary file cannot be created, objcopy fails, or the output cannot
/// be read back.
Expected<std::string> getHostBundle() {
704+
// NOTE(review): presumably cleans up the temporary file when it goes out of
// scope, which is why the contents are copied into a string below - confirm.
TempFileHandlerRAII TempFiles;
705+
706+
auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt);
707+
if (!ModifiedObjPathOrErr)
708+
return ModifiedObjPathOrErr.takeError();
709+
StringRef ModifiedObjPath = *ModifiedObjPathOrErr;
710+
711+
// NOTE(review): Alloc and SS are not referenced again in the visible body;
// they look unused here - confirm before removing.
BumpPtrAllocator Alloc;
712+
StringSaver SS{Alloc};
713+
SmallVector<StringRef, 16> ObjcopyArgs{"llvm-objcopy"};
714+
715+
// --regex makes the --remove-section pattern below a regular expression, so
// all sections whose names start with the bundle prefix are dropped.
ObjcopyArgs.push_back("--regex");
716+
ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*");
717+
ObjcopyArgs.push_back("--");
718+
// Input is the first bundler input file; output is the temporary path.
ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front());
719+
ObjcopyArgs.push_back(ModifiedObjPath);
720+
721+
if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs))
722+
return std::move(Err);
723+
724+
auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath);
725+
if (!BufOrErr)
726+
return createStringError(BufOrErr.getError(),
727+
"Failed to read back the modified object file");
728+
729+
// Return an owning copy of the file contents.
return BufOrErr->get()->getBuffer().str();
730+
}
695731
};
696732

697733
/// Handler for text files. The bundled file will have the following format.

0 commit comments

Comments
 (0)