Skip to content

Very slow linking with -fno-sycl-rdc #8353

Open
@al42and

Description

@al42and

Describe the bug

#7595 introduced the flag to disable relocatable device code. This is supposed to improve compilation times. However, I'm observing the opposite.

The time to compile GROMACS for sm_86 with -j12 goes from 3 minutes (wall time) to over 20 minutes.

This seems to be caused by hundreds of CPU-only files.
During linking, every file is passed to the device compiler, multiplied into almost 200 near-empty bitcode files.

$ ps xua | grep llvm-
aland    3565265 64.0  0.0   6592  3264 pts/10   R    14:12   0:00 /home/aland/intel-sycl/llvm/build/install/bin/llvm-foreach --out-ext=s --in-file-list=/tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc --in-replace=/tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc --out-file-list=/tmp/p2p_protocol-65b068/p2p_protocol-sm_86.s --out-replace=/tmp/p2p_protocol-65b068/p2p_protocol-sm_86.s -- /home/aland/intel-sycl/llvm/build/install/bin/clang-16 -cc1 -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-linux-gnu -fsycl-is-device -fdeclare-spirv-builtins -fenable-sycl-dae -Wno-sycl-strict -sycl-std=2020 -D__SYCL_TARGET_NVIDIA_GPU_SM_86__ -S -disable-free -clear-ast-before-backend -main-file-name p2p_protocol.cpp.o -fsycl-use-main-file-name -full-main-file-name src/gromacs/CMakeFiles/thread_mpi.dir/__/external/thread_mpi/src/p2p_protocol.cpp.o -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -fno-verbose-asm -no-integrated-as -aux-target-cpu x86-64 -internal-isystem /home/aland/intel-sycl/llvm/build/install/bin/../include/sycl -internal-isystem /home/aland/intel-sycl/llvm/build/install/bin/../include -mlink-builtin-bitcode /home/aland/intel-sycl/llvm/build/install/lib/clang/16/../../clc/remangled-l64-signed_char.libspirv-nvptx64-nvidia-cuda.bc -mlink-builtin-bitcode /usr/local/cuda-11.8/nvvm/libdevice/libdevice.10.bc -target-sdk-version=11.8 -target-cpu sm_86 -target-feature +ptx78 -mllvm -treat-scalable-fixed-error-as-warning -debugger-tuning=gdb -fno-dwarf-directory-asm -resource-dir /home/aland/intel-sycl/llvm/build/install/lib/clang/16 -O3 -Wno-linker-warnings -Wno-override-module -Wno-sycl-target -fdebug-compilation-dir=/home/aland/gromacs/build/sycl-edge-cuda -ferror-limit 19 -fno-gpu-rdc -fgnuc-version=4.2.1 -no-opaque-pointers -vectorize-loops -vectorize-slp -o /tmp/p2p_protocol-65b068/p2p_protocol-sm_86.s -x ir /tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc

$ wc -l /tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc
181 /tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc

$ llvm-dis < /tmp/p2p_protocol-e6ce18/p2p_protocol-sm_86_0.bc 
; ModuleID = '<stdin>'
source_filename = "llvm-link"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

@__spirv_BuiltInGlobalInvocationId = dso_local local_unnamed_addr addrspace(1) constant <3 x i64> zeroinitializer, align 32
@__spirv_BuiltInLocalInvocationId = dso_local local_unnamed_addr addrspace(1) constant <3 x i64> zeroinitializer, align 32

; Function Attrs: convergent
declare extern_weak dso_local void @__assertfail(i8* noundef, i8* noundef, i32 noundef, i8* noundef, i64 noundef) local_unnamed_addr #0

; Function Attrs: convergent mustprogress norecurse
define weak dso_local void @_wassert(i8* noundef %_Message, i8* noundef %_File, i32 noundef %_Line) local_unnamed_addr #1 !srcloc !8 {
entry:
  tail call void @__assertfail(i8* noundef %_Message, i8* noundef %_File, i32 noundef %_Line, i8* noundef null, i64 noundef 1) #2
  ret void
}

attributes #0 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_50" "target-features"="+ptx78,+sm_50" }
attributes #1 = { convergent mustprogress norecurse "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="/home/aland/intel-sycl/llvm/libdevice/fallback-cassert.cpp" "target-cpu"="sm_50" "target-features"="+ptx78,+sm_50" }
attributes #2 = { convergent }

!opencl.spir.version = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
!spirv.Source = !{!1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1}
!llvm.ident = !{!2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !3}
!llvm.module.flags = !{!4, !5, !6, !7}
!sycl.specialization-constants = !{}
!sycl.specialization-constants-default-values = !{}

!0 = !{i32 1, i32 2}
!1 = !{i32 4, i32 100000}
!2 = !{!"clang version 16.0.0 (https://github.com/intel/llvm 9a4efa4803e1feee680cf867ad53546482b24726)"}
!3 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!4 = !{i32 1, !"wchar_size", i32 4}
!5 = !{i32 7, !"nvvm-reflect-ftz", i32 0}
!6 = !{i32 7, !"nvvm-reflect-prec-sqrt", i32 0}
!7 = !{i32 7, !"frame-pointer", i32 2}
!8 = !{i32 3875}

The p2p_protocol.cpp file has no device code in it and is not compiled with -fsycl.

To Reproduce

$ wget https://ftp.gromacs.org/gromacs/gromacs-2023.tar.gz
$ tar xf gromacs-2023.tar.gz && cd gromacs-2023 && mkdir build && cd build
$ cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGMX_GPU=SYCL -DGMX_GPU_NB_CLUSTER_SIZE=8 -DSYCL_CXX_FLAGS_EXTRA='-fsycl-targets=nvptx64-nvidia-cuda;-Xsycl-target-backend=nvptx64-nvidia-cuda;--offload-arch=sm_86;-fno-sycl-rdc' -GNinja -DGMX_GPU_FFT_LIBRARY=vkfft
$ time cmake --build . --target gmx -- -j12

Shorter, but less dramatic way to demonstrate

Another way to see the problem is to create a file main.cpp with some SYCL code and a file helper.cpp with some CPU-only code, then observe the compilation output:

GPU_FLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_86
RDC_FLAGS=-fno-sycl-rdc

main.o: main.cpp
        clang++ $(GPU_FLAGS) $(RDC_FLAGS) -v main.cpp -c -o main.o

helper.o: helper.cpp
        clang++ $(RDC_FLAGS) -v helper.cpp -c -o helper.o

main: main.o helper.o
        clang++ $(GPU_FLAGS) $(RDC_FLAGS) -v main.o helper.o -o main

clean:
        rm -f main main.o helper.o

all: main


.PHONY: clean all

Somehow, the file with no GPU code still requires 352 bytes of global memory when linked alongside the SYCL-enabled file:

$ cat helper.cpp 
#include <cmath>
int run_b(float x)
{
    return 1000 * sinf(x);
}
$ make all
...
clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_86 -fno-sycl-rdc -v main.o helper.o -o main
...
 "/home/aland/intel-sycl/llvm/build/install/bin/llvm-foreach" --out-ext=o --in-file-list=/tmp/helper-3436c3/helper-sm_86.s --in-replace=/tmp/helper-3436c3/helper-sm_86.s --out-file-list=/tmp/helper-3cef92/helper-sm_86.cubin --out-replace=/tmp/helper-3cef92/helper-sm_86.cubin -- /usr/local/cuda-11.8/bin/ptxas -m64 -O3 -v --gpu-name sm_86 --output-file /tmp/helper-3cef92/helper-sm_86.cubin /tmp/helper-3436c3/helper-sm_86.s
ptxas info    : 352 bytes gmem

What about AMD?

Linking fails:

clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx1031 -fno-sycl-rdc -v main.o helper.o -o main
clang version 16.0.0 (https://github.com/intel/llvm 4f757188a5d24cebb65e593ded74413f384b749e)
.....
 "/home/aland/intel-sycl/llvm/build/install/bin/llvm-foreach" --out-ext=o --in-file-list=/tmp/main-85e66c/main-gfx1031.o --in-replace=/tmp/main-85e66c/main-gfx1031.o --out-file-list=/tmp/main-18995e/main-gfx1031.o --out-replace=/tmp/main-18995e/main-gfx1031.o -- /home/aland/intel-sycl/llvm/build/install/bin/clang-16 -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fsycl-is-device -fdeclare-spirv-builtins -Wno-sycl-strict -O2 -sycl-std=2020 -D__SYCL_TARGET_AMD_GPU_GFX1031__ -emit-llvm-bc -disable-free -clear-ast-before-backend -main-file-name main.o -fsycl-use-main-file-name -full-main-file-name main.o -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -ffp-contract=on -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -internal-isystem /home/aland/intel-sycl/llvm/build/install//bin/../include/sycl -internal-isystem /home/aland/intel-sycl/llvm/build/install//bin/../include -mlink-builtin-bitcode /home/aland/intel-sycl/llvm/build/install/lib/clang/16/../../clc/remangled-l64-signed_char.libspirv-amdgcn-amd-amdhsa.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_1031.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc -target-cpu gfx1031 -mllvm -treat-scalable-fixed-error-as-warning 
-debugger-tuning=gdb -v -resource-dir /home/aland/intel-sycl/llvm/build/install/lib/clang/16 -fdebug-compilation-dir=/home/aland/sycl_tests/clang-compile -ferror-limit 19 -fno-gpu-rdc -fgnuc-version=4.2.1 -no-opaque-pointers -fcolor-diagnostics -faddrsig -o /tmp/main-18995e/main-gfx1031.o -x object /tmp/main-85e66c/main-gfx1031.o
error: invalid value 'object' in '-x object'
llvm-foreach: 
make: *** [Makefile:12: main] Error 1

Environment (please complete the following information):

  • OS: Ubuntu 22.04
  • Target device and vendor: NVIDIA GPU
  • DPC++ version: 4f75718
  • Dependencies version: CUDA 11.8, ROCm 5.4.2

Metadata

Metadata

Assignees

No one assigned

    Labels

    confirmed · cuda (CUDA back-end) · enhancement (New feature or request) · hip (Issues related to execution on HIP backend.) · performance (Performance related issues)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions