diff --git a/clang/docs/ClangLinkerWrapper.rst b/clang/docs/ClangLinkerWrapper.rst index fbabb4f8613b3..1e851b0aa0619 100644 --- a/clang/docs/ClangLinkerWrapper.rst +++ b/clang/docs/ClangLinkerWrapper.rst @@ -54,12 +54,30 @@ only for the linker wrapper will be forwarded to the wrapped linker job. --pass-remarks= Pass remarks for LTO --print-wrapped-module Print the wrapped module's IR for testing --ptxas-arg= Argument to pass to the 'ptxas' invocation + --relocatable Link device code to create a relocatable offloading application --save-temps Save intermediate results --sysroot Set the system root --verbose Verbose output from tools --v Display the version number and exit -- The separator for the wrapped linker arguments +Relocatable Linking +=================== + +The ``clang-linker-wrapper`` handles linking embedded device code and then +registering it with the appropriate runtime. Normally, this is only done when +the executable is created so other files containing device code can be linked +together. This can be somewhat problematic for users who wish to ship static +libraries that contain offloading code to users without a compatible offloading +toolchain. + +When using a relocatable link with ``-r``, the ``clang-linker-wrapper`` will +perform the device linking and registration eagerly. This will remove the +embedded device code and register it correctly with the runtime. Semantically, +this is similar to creating a shared library object. If standard relocatable +linking is desired, simply do not run the binaries through the +``clang-linker-wrapper``. This will simply append the embedded device code so +that it can be linked later. 
Example ======= diff --git a/clang/docs/OffloadingDesign.rst b/clang/docs/OffloadingDesign.rst index 209b89e0c5a16..04319cd869b19 100644 --- a/clang/docs/OffloadingDesign.rst +++ b/clang/docs/OffloadingDesign.rst @@ -470,3 +470,37 @@ We can see the steps created by clang to generate the offloading code using the # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["/tmp/zaxpy-07f434.s"], output: "/tmp/zaxpy-0af7b7.o" # "x86_64-unknown-linux-gnu" - "clang", inputs: ["/tmp/zaxpy-e6a41b.bc", "/tmp/zaxpy-0af7b7.o"], output: "/tmp/zaxpy-416cad.o" # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["/tmp/zaxpy-416cad.o"], output: "a.out" + +Relocatable Linking +------------------- + +The offloading compilation pipeline normally will defer the final device linking +and runtime registration until the ``clang-linker-wrapper`` is run to create the +executable. This is the standard behaviour when compiling for OpenMP offloading +or CUDA and HIP in ``-fgpu-rdc`` mode. However, there are some cases where the +user may wish to perform this device handling prematurely. This is described in +the :doc:`linker wrapper documentation <ClangLinkerWrapper>`. + +Effectively, this allows the user to handle offloading specific linking ahead of +time when shipping objects or static libraries. This can be thought of as +performing a standard ``-fno-gpu-rdc`` compilation on a subset of object files. +This can be useful to reduce link time, prevent users from interacting with the +library's device code, or for shipping libraries to incompatible compilers. + +Normally, if a relocatable link is done using ``clang -r`` it will simply merge +the ``.llvm.offloading`` sections which will then be linked later when the +executable is created. However, if the ``-r`` flag is used with the offloading +toolchain, it will perform the device linking and registration phases and then +merge the registration code into the final relocatable object file. 
+ +The following example shows how using the relocatable link with the offloading +pipeline can create a static library with offloading code that can be +redistributed without requiring any additional handling. + +.. code-block:: console + + $ clang++ -fopenmp -fopenmp-targets=nvptx64 foo.cpp -c + $ clang++ -lomptarget.devicertl --offload-link -r foo.o -o merged.o + $ llvm-ar rcs libfoo.a merged.o + # g++ app.cpp -L. -lfoo + diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index b5d8ae217a972..08f860f6cab0d 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -9,6 +9,8 @@ // RUN: -fembed-offload-object=%t.out // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-ELF +// RUN: clang-linker-wrapper --print-wrapped-module --dry-run -r --host-triple=x86_64-unknown-linux-gnu \ +// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP-ELF,OPENMP-REL // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-COFF @@ -19,6 +21,8 @@ // OPENMP-COFF: @__start_omp_offloading_entries = weak_odr hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries$OA" // OPENMP-COFF-NEXT: @__stop_omp_offloading_entries = weak_odr hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries$OZ" +// OPENMP-REL: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading.relocatable", align 8 + // OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", 
section ".llvm.offloading", align 8 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }] // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries } @@ -42,6 +46,8 @@ // RUN: -fembed-offload-object=%t.out // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-ELF +// RUN: clang-linker-wrapper --print-wrapped-module --dry-run -r --host-triple=x86_64-unknown-linux-gnu \ +// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-ELF // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-COFF @@ -140,6 +146,8 @@ // RUN: -fembed-offload-object=%t.out // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-ELF +// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu -r \ +// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-ELF // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-COFF 
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index a8667c99977c5..010001b83d7c2 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -176,10 +176,36 @@ __attribute__((visibility("protected"), used)) int x; // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: llvm-ar rcs %t.a %t.o // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ -// RUN: --linker-path=/usr/bin/ld.lld -- -r --whole-archive %t.a --no-whole-archive \ +// RUN: --linker-path=/usr/bin/ld.lld -- -r %t.o \ // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK -// RELOCATABLE-LINK-NOT: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu +// RELOCATABLE-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu // RELOCATABLE-LINK: /usr/bin/ld.lld{{.*}}-r +// RELOCATABLE-LINK: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading + +// RUN: clang-offload-packager -o %t.out \ +// RUN: --image=file=%t.elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \ +// RUN: --image=file=%t.elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ +// RUN: --linker-path=/usr/bin/ld.lld -- -r %t.o \ +// RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-HIP + +// RELOCATABLE-LINK-HIP: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa +// RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}} -output={{.*}} +// RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r 
+// RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading + +// RUN: clang-offload-packager -o %t.out \ +// RUN: --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \ +// RUN: --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ +// RUN: --linker-path=/usr/bin/ld.lld -- -r %t.o \ +// RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-CUDA + +// RELOCATABLE-LINK-CUDA: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda +// RELOCATABLE-LINK-CUDA: fatbinary{{.*}} -64 --create {{.*}}.fatbin --image=profile=sm_89,file={{.*}}.img +// RELOCATABLE-LINK-CUDA: /usr/bin/ld.lld{{.*}}-r +// RELOCATABLE-LINK-CUDA: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index b682cc293d54b..095cf5ed38169 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -241,6 +241,70 @@ Expected findProgram(StringRef Name, ArrayRef Paths) { return *Path; } +/// Returns the low 64 bits of the MD5 digest of \p Str as lowercase hex. +std::string getHash(StringRef Str) { + llvm::MD5 Hasher; + llvm::MD5::MD5Result Hash; + Hasher.update(Str); + Hasher.final(Hash); + return llvm::utohexstr(Hash.low(), /*LowerCase=*/true); +} + +/// Strips `.llvm.offloading` from \p Output and renames the offloading entry +/// sections and their __start_/__stop_ symbols to avoid later link conflicts. 
+Error relocateOffloadSection(const ArgList &Args, StringRef Output) { + llvm::Triple Triple( + Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple())); + if (Triple.isOSWindows()) + return createStringError( + inconvertibleErrorCode(), + "Relocatable linking is not supported on COFF targets"); + + Expected ObjcopyPath = + findProgram("llvm-objcopy", {getMainExecutable("llvm-objcopy")}); + if (!ObjcopyPath) + return ObjcopyPath.takeError(); + + // Hash the linker output to get a suffix for the renamed sections that is + // deterministic in the file contents (an empty buffer is used for dry runs). + auto BufferOrErr = DryRun ? MemoryBuffer::getMemBuffer("") + : MemoryBuffer::getFileOrSTDIN(Output); + if (!BufferOrErr) + return createStringError(inconvertibleErrorCode(), "Failed to open %s", + Output.str().c_str()); + std::string Suffix = "_" + getHash((*BufferOrErr)->getBuffer()); + + SmallVector ObjcopyArgs = { + *ObjcopyPath, + Output, + }; + + // Remove the old .llvm.offloading section to prevent further linking. + ObjcopyArgs.emplace_back("--remove-section"); + ObjcopyArgs.emplace_back(".llvm.offloading"); + for (StringRef Prefix : {"omp", "cuda", "hip"}) { + auto Section = (Prefix + "_offloading_entries").str(); + // Rename the offloading entries to make them private to this link unit. + ObjcopyArgs.emplace_back("--rename-section"); + ObjcopyArgs.emplace_back( + Args.MakeArgString(Section + "=" + Section + Suffix)); + + // Rename the __start_ / __stop_ symbols appropriately to iterate over the + // newly renamed section containing the offloading entries. 
+ ObjcopyArgs.emplace_back("--redefine-sym"); + ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" + + "__start_" + Section + Suffix)); + ObjcopyArgs.emplace_back("--redefine-sym"); + ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" + + "__stop_" + Section + Suffix)); + } + + if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs)) + return Err; + + return Error::success(); +} + /// Runs the wrapped linker job with the newly created input. Error runLinker(ArrayRef Files, const ArgList &Args) { llvm::TimeTraceScope TimeScope("Execute host linker"); @@ -265,6 +329,10 @@ Error runLinker(ArrayRef Files, const ArgList &Args) { LinkerArgs.push_back(Arg); if (Error Err = executeCommands(LinkerPath, LinkerArgs)) return Err; + + if (Args.hasArg(OPT_relocatable)) + return relocateOffloadSection(Args, ExecutableName); + return Error::success(); } @@ -910,7 +978,8 @@ wrapDeviceImages(ArrayRef> Buffers, case OFK_OpenMP: if (Error Err = offloading::wrapOpenMPBinaries( M, BuffersToWrap, - offloading::getOffloadEntryArray(M, "omp_offloading_entries"))) + offloading::getOffloadEntryArray(M, "omp_offloading_entries"), + /*Suffix=*/"", /*Relocatable=*/Args.hasArg(OPT_relocatable))) return std::move(Err); break; case OFK_Cuda: @@ -1356,12 +1425,6 @@ Expected>> getDeviceInput(const ArgList &Args) { llvm::TimeTraceScope TimeScope("ExtractDeviceCode"); - // If the user is requesting a reloctable link we ignore the device code. The - // actual linker will merge the embedded device code sections so they can be - // linked when the executable is finally created. 
- if (Args.hasArg(OPT_relocatable)) - return SmallVector>{}; - StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ); SmallVector LibraryPaths; for (const opt::Arg *Arg : Args.filtered(OPT_library_path, OPT_libpath)) diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index c59cb0fb3e7cb..763426570c2a6 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -127,8 +127,9 @@ def version : Flag<["--", "-"], "version">, Flags<[HelpHidden]>, Alias; def whole_archive : Flag<["--", "-"], "whole-archive">, Flags<[HelpHidden]>; def no_whole_archive : Flag<["--", "-"], "no-whole-archive">, Flags<[HelpHidden]>; -def relocatable : Flag<["--", "-"], "relocatable">, Flags<[HelpHidden]>; -def r : Flag<["-"], "r">, Alias, Flags<[HelpHidden]>; +def relocatable : Flag<["--", "-"], "relocatable">, + HelpText<"Link device code to create a relocatable offloading application">; +def r : Flag<["-"], "r">, Alias; // link.exe-style linker options. def out : Joined<["/", "-", "/?", "-?"], "out:">, Flags<[HelpHidden]>; diff --git a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h index e3ded00b573f7..79309251c3b6b 100644 --- a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h +++ b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h @@ -20,10 +20,13 @@ using EntryArrayTy = std::pair; /// \param EntryArray Optional pair pointing to the `__start` and `__stop` /// symbols holding the `__tgt_offload_entry` array. /// \param Suffix An optional suffix appended to the emitted symbols. +/// \param Relocatable Indicate if we need to change the offloading section to +/// create a relocatable object. 
llvm::Error wrapOpenMPBinaries(llvm::Module &M, llvm::ArrayRef> Images, EntryArrayTy EntryArray, - llvm::StringRef Suffix = ""); + llvm::StringRef Suffix = "", + bool Relocatable = false); /// Wraps the input fatbinary image into the module \p M as global symbols and /// registers the images with the CUDA runtime. diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index 76a8eebdb3622..fec1bdbe9d8c7 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -112,7 +112,8 @@ PointerType *getBinDescPtrTy(Module &M) { /// /// Global variable that represents BinDesc is returned. GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, - EntryArrayTy EntryArray, StringRef Suffix) { + EntryArrayTy EntryArray, StringRef Suffix, + bool Relocatable) { LLVMContext &C = M.getContext(); auto [EntriesB, EntriesE] = EntryArray; @@ -129,7 +130,8 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, GlobalVariable::InternalLinkage, Data, ".omp_offloading.device_image" + Suffix); Image->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - Image->setSection(".llvm.offloading"); + Image->setSection(Relocatable ? 
".llvm.offloading.relocatable" + : ".llvm.offloading"); Image->setAlignment(Align(object::OffloadBinary::getAlignment())); StringRef Binary(Buf.data(), Buf.size()); @@ -582,8 +584,9 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef> Images, EntryArrayTy EntryArray, - llvm::StringRef Suffix) { - GlobalVariable *Desc = createBinDesc(M, Images, EntryArray, Suffix); + llvm::StringRef Suffix, bool Relocatable) { + GlobalVariable *Desc = + createBinDesc(M, Images, EntryArray, Suffix, Relocatable); if (!Desc) return createStringError(inconvertibleErrorCode(), "No binary descriptors created."); diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp index bfc35e41fe658..22d604b125c58 100644 --- a/llvm/lib/Object/OffloadBinary.cpp +++ b/llvm/lib/Object/OffloadBinary.cpp @@ -83,7 +83,7 @@ Error extractFromObject(const ObjectFile &Obj, if (!NameOrErr) return NameOrErr.takeError(); - if (!NameOrErr->equals(".llvm.offloading")) + if (!NameOrErr->starts_with(".llvm.offloading")) continue; }