llvm · jhuber6 · Feb 7, 2024 · Jan 30, 2024 · Jan 31, 2024 · Jan 31, 2024
diff --git a/clang/docs/ClangLinkerWrapper.rst b/clang/docs/ClangLinkerWrapper.rst
@@ -54,12 +54,30 @@ only for the linker wrapper will be forwarded to the wrapped linker job.
     --pass-remarks=<value> Pass remarks for LTO
     --print-wrapped-module Print the wrapped module's IR for testing
     --ptxas-arg=<value>    Argument to pass to the 'ptxas' invocation
+    --relocatable           Link device code to create a relocatable offloading application
     --save-temps           Save intermediate results
     --sysroot<value>       Set the system root
     --verbose              Verbose output from tools
     --v                    Display the version number and exit
     --                     The separator for the wrapped linker arguments
 
+Relocatable Linking
+===================
+
+The ``clang-linker-wrapper`` handles linking embedded device code and then 
+registering it with the appropriate runtime. Normally, this is only done when 
+the executable is created so other files containing device code can be linked 
+together. This can be somewhat problematic for users who wish to ship static 
+libraries that contain offloading code to users without a compatible offloading 
+toolchain.
+
+When using a relocatable link with ``-r``, the ``clang-linker-wrapper`` will 
+perform the device linking and registration eagerly. This will remove the 
+embedded device code and register it correctly with the runtime. Semantically, 
+this is similar to creating a shared library object. If standard relocatable 
+linking is desired, simply do not run the binaries through the 
+``clang-linker-wrapper``. This will simply append the embedded device code so 
+that it can be linked later.
 
 Example
 =======

diff --git a/clang/docs/OffloadingDesign.rst b/clang/docs/OffloadingDesign.rst
@@ -470,3 +470,37 @@ We can see the steps created by clang to generate the offloading code using the
     # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["/tmp/zaxpy-07f434.s"], output: "/tmp/zaxpy-0af7b7.o"
     # "x86_64-unknown-linux-gnu" - "clang", inputs: ["/tmp/zaxpy-e6a41b.bc", "/tmp/zaxpy-0af7b7.o"], output: "/tmp/zaxpy-416cad.o"
     # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["/tmp/zaxpy-416cad.o"], output: "a.out"
+
+Relocatable Linking
+-------------------
+
+The offloading compilation pipeline normally will defer the final device linking 
+and runtime registration until the ``clang-linker-wrapper`` is run to create the 
+executable. This is the standard behaviour when compiling for OpenMP offloading 
+or CUDA and HIP in ``-fgpu-rdc`` mode. However, there are some cases where the 
+user may wish to perform this device handling prematurely. This is described in 
+the :doc:`linker wrapper documentation<ClangLinkerWrapper>`.
+
+Effectively, this allows the user to handle offloading specific linking ahead of 
+time when shipping objects or static libraries. This can be thought of as 
+performing a standard ``-fno-gpu-rdc`` compilation on a subset of object files. 
+This can be useful to reduce link time, prevent users from interacting with the 
+library's device code, or for shipping libraries to incompatible compilers.
+
+Normally, if a relocatable link is done using ``clang -r`` it will simply merge 
+the ``.llvm.offloading`` sections which will then be linked later when the 
+executable is created. However, if the ``-r`` flag is used with the offloading 
+toolchain, it will perform the device linking and registration phases and then 
+merge the registration code into the final relocatable object file.
+
+The following example shows how using the relocatable link with the offloading 
+pipeline can create a static library with offloading code that can be 
+redistributed without requiring any additional handling.
+
+.. code-block:: console
+
+    $ clang++ -fopenmp -fopenmp-targets=nvptx64 foo.cpp -c
+    $ clang++ -lomptarget.devicertl --offload-link -r foo.o -o merged.o
+    $ llvm-ar rcs libfoo.a merged.o
+    # g++ app.cpp -L. -lfoo
+
diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
@@ -9,6 +9,8 @@
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run -r --host-triple=x86_64-unknown-linux-gnu \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP-ELF,OPENMP-REL
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=OPENMP,OPENMP-COFF
 
@@ -19,6 +21,8 @@
 //      OPENMP-COFF: @__start_omp_offloading_entries = weak_odr hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries$OA"
 // OPENMP-COFF-NEXT: @__stop_omp_offloading_entries = weak_odr hidden constant [0 x %struct.__tgt_offload_entry] zeroinitializer, section "omp_offloading_entries$OZ"
 
+// OPENMP-REL: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading.relocatable", align 8
+
 //      OPENMP: @.omp_offloading.device_image = internal unnamed_addr constant [[[SIZE:[0-9]+]] x i8] c"\10\FF\10\AD{{.*}}", section ".llvm.offloading", align 8
 // OPENMP-NEXT: @.omp_offloading.device_images = internal unnamed_addr constant [1 x %__tgt_device_image] [%__tgt_device_image { ptr getelementptr inbounds ([[[BEGIN:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr getelementptr inbounds ([[[END:[0-9]+]] x i8], ptr @.omp_offloading.device_image, i64 1, i64 0), ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }]
 // OPENMP-NEXT: @.omp_offloading.descriptor = internal constant %__tgt_bin_desc { i32 1, ptr @.omp_offloading.device_images, ptr @__start_omp_offloading_entries, ptr @__stop_omp_offloading_entries }
@@ -42,6 +46,8 @@
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run -r --host-triple=x86_64-unknown-linux-gnu \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-ELF
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=CUDA,CUDA-COFF
 
@@ -140,6 +146,8 @@
 // RUN:   -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-ELF
+// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-linux-gnu -r \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-ELF
 // RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple=x86_64-unknown-windows-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefixes=HIP,HIP-COFF
 

diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
@@ -176,10 +176,36 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: llvm-ar rcs %t.a %t.o
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld.lld -- -r --whole-archive %t.a --no-whole-archive \
+// RUN:   --linker-path=/usr/bin/ld.lld -- -r %t.o \
 // RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK
 
-// RELOCATABLE-LINK-NOT: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu
+// RELOCATABLE-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu
 // RELOCATABLE-LINK: /usr/bin/ld.lld{{.*}}-r
+// RELOCATABLE-LINK: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%t.elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \
+// RUN:   --image=file=%t.elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
+// RUN:   --linker-path=/usr/bin/ld.lld -- -r %t.o \
+// RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-HIP
+
+// RELOCATABLE-LINK-HIP: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa
+// RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}} -output={{.*}}
+// RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
+// RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
+// RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
+// RUN:   --linker-path=/usr/bin/ld.lld -- -r %t.o \
+// RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-CUDA
+
+// RELOCATABLE-LINK-CUDA: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda
+// RELOCATABLE-LINK-CUDA: fatbinary{{.*}} -64 --create {{.*}}.fatbin --image=profile=sm_89,file={{.*}}.img
+// RELOCATABLE-LINK-CUDA: /usr/bin/ld.lld{{.*}}-r
+// RELOCATABLE-LINK-CUDA: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -241,6 +241,70 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
   return *Path;
 }
 
+/// Returns the hashed value for a constant string.
+std::string getHash(StringRef Str) {
+  llvm::MD5 Hasher;
+  llvm::MD5::MD5Result Hash;
+  Hasher.update(Str);
+  Hasher.final(Hash);
+  return llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
+}
+
+/// Renames offloading entry sections in a relocatable link so they do not
+/// conflict with a later link job.
+Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
+  llvm::Triple Triple(
+      Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple()));
+  if (Triple.isOSWindows())
+    return createStringError(
+        inconvertibleErrorCode(),
+        "Relocatable linking is not supported on COFF targets");
+
+  Expected<std::string> ObjcopyPath =
+      findProgram("llvm-objcopy", {getMainExecutable("llvm-objcopy")});
+  if (!ObjcopyPath)
+    return ObjcopyPath.takeError();
+
+  // Use the linker output file to get a unique hash. This creates a unique
+  // identifier to rename the sections to that is deterministic to the contents.
+  auto BufferOrErr = DryRun ? MemoryBuffer::getMemBuffer("")
+                            : MemoryBuffer::getFileOrSTDIN(Output);
+  if (!BufferOrErr)
+    return createStringError(inconvertibleErrorCode(), "Failed to open %s",
+                             Output.str().c_str());
+  std::string Suffix = "_" + getHash((*BufferOrErr)->getBuffer());
+
+  SmallVector<StringRef> ObjcopyArgs = {
+      *ObjcopyPath,
+      Output,
+  };
+
+  // Remove the old .llvm.offloading section to prevent further linking.
+  ObjcopyArgs.emplace_back("--remove-section");
+  ObjcopyArgs.emplace_back(".llvm.offloading");
+  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
+    auto Section = (Prefix + "_offloading_entries").str();
+    // Rename the offloading entires to make them private to this link unit.
+    ObjcopyArgs.emplace_back("--rename-section");
+    ObjcopyArgs.emplace_back(
+        Args.MakeArgString(Section + "=" + Section + Suffix));
+
+    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+    // newly renamed section containing the offloading entries.
+    ObjcopyArgs.emplace_back("--redefine-sym");
+    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                                "__start_" + Section + Suffix));
+    ObjcopyArgs.emplace_back("--redefine-sym");
+    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                                "__stop_" + Section + Suffix));
+  }
+
+  if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
+    return Err;
+
+  return Error::success();
+}
+
 /// Runs the wrapped linker job with the newly created input.
 Error runLinker(ArrayRef<StringRef> Files, const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("Execute host linker");
@@ -265,6 +329,10 @@ Error runLinker(ArrayRef<StringRef> Files, const ArgList &Args) {
     LinkerArgs.push_back(Arg);
   if (Error Err = executeCommands(LinkerPath, LinkerArgs))
     return Err;
+
+  if (Args.hasArg(OPT_relocatable))
+    return relocateOffloadSection(Args, ExecutableName);
+
   return Error::success();
 }
 
@@ -910,7 +978,8 @@ wrapDeviceImages(ArrayRef<std::unique_ptr<MemoryBuffer>> Buffers,
   case OFK_OpenMP:
     if (Error Err = offloading::wrapOpenMPBinaries(
             M, BuffersToWrap,
-            offloading::getOffloadEntryArray(M, "omp_offloading_entries")))
+            offloading::getOffloadEntryArray(M, "omp_offloading_entries"),
+            /*Suffix=*/"", /*Relocatable=*/Args.hasArg(OPT_relocatable)))
       return std::move(Err);
     break;
   case OFK_Cuda:
@@ -1356,12 +1425,6 @@ Expected<SmallVector<SmallVector<OffloadFile>>>
 getDeviceInput(const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("ExtractDeviceCode");
 
-  // If the user is requesting a reloctable link we ignore the device code. The
-  // actual linker will merge the embedded device code sections so they can be
-  // linked when the executable is finally created.
-  if (Args.hasArg(OPT_relocatable))
-    return SmallVector<SmallVector<OffloadFile>>{};
-
   StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ);
   SmallVector<StringRef> LibraryPaths;
   for (const opt::Arg *Arg : Args.filtered(OPT_library_path, OPT_libpath))

diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -127,8 +127,9 @@ def version : Flag<["--", "-"], "version">, Flags<[HelpHidden]>, Alias<v>;
 def whole_archive : Flag<["--", "-"], "whole-archive">, Flags<[HelpHidden]>;
 def no_whole_archive : Flag<["--", "-"], "no-whole-archive">, Flags<[HelpHidden]>;
 
-def relocatable : Flag<["--", "-"], "relocatable">, Flags<[HelpHidden]>;
-def r : Flag<["-"], "r">, Alias<relocatable>, Flags<[HelpHidden]>;
+def relocatable : Flag<["--", "-"], "relocatable">, 
+    HelpText<"Link device code to create a relocatable offloading application">;
+def r : Flag<["-"], "r">, Alias<relocatable>;
 
 // link.exe-style linker options.
 def out : Joined<["/", "-", "/?", "-?"], "out:">, Flags<[HelpHidden]>;

diff --git a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h
@@ -20,10 +20,13 @@ using EntryArrayTy = std::pair<GlobalVariable *, GlobalVariable *>;
 /// \param EntryArray Optional pair pointing to the `__start` and `__stop`
 /// symbols holding the `__tgt_offload_entry` array.
 /// \param Suffix An optional suffix appended to the emitted symbols.
+/// \param Relocatable Indicate if we need to change the offloading section to
+/// create a relocatable object.
 llvm::Error wrapOpenMPBinaries(llvm::Module &M,
                                llvm::ArrayRef<llvm::ArrayRef<char>> Images,
                                EntryArrayTy EntryArray,
-                               llvm::StringRef Suffix = "");
+                               llvm::StringRef Suffix = "",
+                               bool Relocatable = false);
 
 /// Wraps the input fatbinary image into the module \p M as global symbols and
 /// registers the images with the CUDA runtime.

diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -112,7 +112,8 @@ PointerType *getBinDescPtrTy(Module &M) {
 ///
 /// Global variable that represents BinDesc is returned.
 GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs,
-                              EntryArrayTy EntryArray, StringRef Suffix) {
+                              EntryArrayTy EntryArray, StringRef Suffix,
+                              bool Relocatable) {
   LLVMContext &C = M.getContext();
   auto [EntriesB, EntriesE] = EntryArray;
 
@@ -129,7 +130,8 @@ GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs,
                                      GlobalVariable::InternalLinkage, Data,
                                      ".omp_offloading.device_image" + Suffix);
     Image->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-    Image->setSection(".llvm.offloading");
+    Image->setSection(Relocatable ? ".llvm.offloading.relocatable"
+                                  : ".llvm.offloading");
     Image->setAlignment(Align(object::OffloadBinary::getAlignment()));
 
     StringRef Binary(Buf.data(), Buf.size());
@@ -582,8 +584,9 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc,
 
 Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images,
                                      EntryArrayTy EntryArray,
-                                     llvm::StringRef Suffix) {
-  GlobalVariable *Desc = createBinDesc(M, Images, EntryArray, Suffix);
+                                     llvm::StringRef Suffix, bool Relocatable) {
+  GlobalVariable *Desc =
+      createBinDesc(M, Images, EntryArray, Suffix, Relocatable);
   if (!Desc)
     return createStringError(inconvertibleErrorCode(),
                              "No binary descriptors created.");

diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp
@@ -83,7 +83,7 @@ Error extractFromObject(const ObjectFile &Obj,
       if (!NameOrErr)
         return NameOrErr.takeError();
 
-      if (!NameOrErr->equals(".llvm.offloading"))
+      if (!NameOrErr->starts_with(".llvm.offloading"))
         continue;
     }