Description
Describe the bug
#7595 introduced the flag to disable relocatable device code. This is supposed to improve compilation times. However, I'm observing the opposite.
The time to compile GROMACS for sm_86 with -j12 goes from 3 minutes (wall time) to over 20 minutes.
This seems to be caused by hundreds of CPU-only files.
During linking, every file is passed to the device compiler, and each one is multiplied into almost 200 near-empty bitcode files.
$ ps xua | grep llvm-
aland 3565265 64.0 0.0 6592 3264 pts/10 R 14:12 0:00 /home/aland/intel-sycl/llvm/build/install/bin/llvm-foreach --out-ext=s --in-file-list=/tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc --in-replace=/tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc --out-file-list=/tmp/p2p_protocol-65b068/p2p_protocol-sm_86.s --out-replace=/tmp/p2p_protocol-65b068/p2p_protocol-sm_86.s -- /home/aland/intel-sycl/llvm/build/install/bin/clang-16 -cc1 -triple nvptx64-nvidia-cuda -aux-triple x86_64-unknown-linux-gnu -fsycl-is-device -fdeclare-spirv-builtins -fenable-sycl-dae -Wno-sycl-strict -sycl-std=2020 -D__SYCL_TARGET_NVIDIA_GPU_SM_86__ -S -disable-free -clear-ast-before-backend -main-file-name p2p_protocol.cpp.o -fsycl-use-main-file-name -full-main-file-name src/gromacs/CMakeFiles/thread_mpi.dir/__/external/thread_mpi/src/p2p_protocol.cpp.o -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -fno-verbose-asm -no-integrated-as -aux-target-cpu x86-64 -internal-isystem /home/aland/intel-sycl/llvm/build/install/bin/../include/sycl -internal-isystem /home/aland/intel-sycl/llvm/build/install/bin/../include -mlink-builtin-bitcode /home/aland/intel-sycl/llvm/build/install/lib/clang/16/../../clc/remangled-l64-signed_char.libspirv-nvptx64-nvidia-cuda.bc -mlink-builtin-bitcode /usr/local/cuda-11.8/nvvm/libdevice/libdevice.10.bc -target-sdk-version=11.8 -target-cpu sm_86 -target-feature +ptx78 -mllvm -treat-scalable-fixed-error-as-warning -debugger-tuning=gdb -fno-dwarf-directory-asm -resource-dir /home/aland/intel-sycl/llvm/build/install/lib/clang/16 -O3 -Wno-linker-warnings -Wno-override-module -Wno-sycl-target -fdebug-compilation-dir=/home/aland/gromacs/build/sycl-edge-cuda -ferror-limit 19 -fno-gpu-rdc -fgnuc-version=4.2.1 -no-opaque-pointers -vectorize-loops -vectorize-slp -o /tmp/p2p_protocol-65b068/p2p_protocol-sm_86.s -x ir /tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc
$ wc -l /tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc
181 /tmp/p2p_protocol-69886f/p2p_protocol-sm_86.bc
$ llvm-dis < /tmp/p2p_protocol-e6ce18/p2p_protocol-sm_86_0.bc
; ModuleID = '<stdin>'
source_filename = "llvm-link"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@__spirv_BuiltInGlobalInvocationId = dso_local local_unnamed_addr addrspace(1) constant <3 x i64> zeroinitializer, align 32
@__spirv_BuiltInLocalInvocationId = dso_local local_unnamed_addr addrspace(1) constant <3 x i64> zeroinitializer, align 32
; Function Attrs: convergent
declare extern_weak dso_local void @__assertfail(i8* noundef, i8* noundef, i32 noundef, i8* noundef, i64 noundef) local_unnamed_addr #0
; Function Attrs: convergent mustprogress norecurse
define weak dso_local void @_wassert(i8* noundef %_Message, i8* noundef %_File, i32 noundef %_Line) local_unnamed_addr #1 !srcloc !8 {
entry:
tail call void @__assertfail(i8* noundef %_Message, i8* noundef %_File, i32 noundef %_Line, i8* noundef null, i64 noundef 1) #2
ret void
}
attributes #0 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_50" "target-features"="+ptx78,+sm_50" }
attributes #1 = { convergent mustprogress norecurse "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="/home/aland/intel-sycl/llvm/libdevice/fallback-cassert.cpp" "target-cpu"="sm_50" "target-features"="+ptx78,+sm_50" }
attributes #2 = { convergent }
!opencl.spir.version = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0}
!spirv.Source = !{!1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1, !1}
!llvm.ident = !{!2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !3}
!llvm.module.flags = !{!4, !5, !6, !7}
!sycl.specialization-constants = !{}
!sycl.specialization-constants-default-values = !{}
!0 = !{i32 1, i32 2}
!1 = !{i32 4, i32 100000}
!2 = !{!"clang version 16.0.0 (https://github.com/intel/llvm 9a4efa4803e1feee680cf867ad53546482b24726)"}
!3 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!4 = !{i32 1, !"wchar_size", i32 4}
!5 = !{i32 7, !"nvvm-reflect-ftz", i32 0}
!6 = !{i32 7, !"nvvm-reflect-prec-sqrt", i32 0}
!7 = !{i32 7, !"frame-pointer", i32 2}
!8 = !{i32 3875}
The p2p_protocol.cpp file has no device code in it and is not compiled with -fsycl.
To Reproduce
$ wget https://ftp.gromacs.org/gromacs/gromacs-2023.tar.gz
$ tar xf gromacs-2023.tar.gz && cd gromacs-2023 && mkdir build && cd build
$ cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGMX_GPU=SYCL -DGMX_GPU_NB_CLUSTER_SIZE=8 -DSYCL_CXX_FLAGS_EXTRA='-fsycl-targets=nvptx64-nvidia-cuda;-Xsycl-target-backend=nvptx64-nvidia-cuda;--offload-arch=sm_86;-fno-sycl-rdc' -GNinja -DGMX_GPU_FFT_LIBRARY=vkfft
$ time cmake --build . --target gmx -- -j12
Shorter, but less dramatic way to demonstrate
Another way to see the problem is to create a file main.cpp with some SYCL code and a file helper.cpp with some CPU-only code, and then observe the compilation output when building with the following Makefile:
GPU_FLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_86
RDC_FLAGS=-fno-sycl-rdc
main.o: main.cpp
clang++ $(GPU_FLAGS) $(RDC_FLAGS) -v main.cpp -c -o main.o
helper.o: helper.cpp
clang++ $(RDC_FLAGS) -v helper.cpp -c -o helper.o
main: main.o helper.o
clang++ $(GPU_FLAGS) $(RDC_FLAGS) -v main.o helper.o -o main
clean:
rm -f main main.o helper.o
all: main
.PHONY: clean all
Somehow, the file with no GPU code still requires 352 bytes of global memory when linked alongside the SYCL-enabled file:
$ cat helper.cpp
#include <cmath>
int run_b(float x)
{
return 1000 * sinf(x);
}
$ make all
...
clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_86 -fno-sycl-rdc -v main.o helper.o -o main
...
"/home/aland/intel-sycl/llvm/build/install/bin/llvm-foreach" --out-ext=o --in-file-list=/tmp/helper-3436c3/helper-sm_86.s --in-replace=/tmp/helper-3436c3/helper-sm_86.s --out-file-list=/tmp/helper-3cef92/helper-sm_86.cubin --out-replace=/tmp/helper-3cef92/helper-sm_86.cubin -- /usr/local/cuda-11.8/bin/ptxas -m64 -O3 -v --gpu-name sm_86 --output-file /tmp/helper-3cef92/helper-sm_86.cubin /tmp/helper-3436c3/helper-sm_86.s
ptxas info : 352 bytes gmem
What about AMD?
Linking fails:
clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx1031 -fno-sycl-rdc -v main.o helper.o -o main
clang version 16.0.0 (https://github.com/intel/llvm 4f757188a5d24cebb65e593ded74413f384b749e)
.....
"/home/aland/intel-sycl/llvm/build/install/bin/llvm-foreach" --out-ext=o --in-file-list=/tmp/main-85e66c/main-gfx1031.o --in-replace=/tmp/main-85e66c/main-gfx1031.o --out-file-list=/tmp/main-18995e/main-gfx1031.o --out-replace=/tmp/main-18995e/main-gfx1031.o -- /home/aland/intel-sycl/llvm/build/install/bin/clang-16 -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fsycl-is-device -fdeclare-spirv-builtins -Wno-sycl-strict -O2 -sycl-std=2020 -D__SYCL_TARGET_AMD_GPU_GFX1031__ -emit-llvm-bc -disable-free -clear-ast-before-backend -main-file-name main.o -fsycl-use-main-file-name -full-main-file-name main.o -mrelocation-model pic -pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -ffp-contract=on -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -internal-isystem /home/aland/intel-sycl/llvm/build/install//bin/../include/sycl -internal-isystem /home/aland/intel-sycl/llvm/build/install//bin/../include -mlink-builtin-bitcode /home/aland/intel-sycl/llvm/build/install/lib/clang/16/../../clc/remangled-l64-signed_char.libspirv-amdgcn-amd-amdhsa.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_isa_version_1031.bc -mlink-builtin-bitcode /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc -target-cpu gfx1031 -mllvm -treat-scalable-fixed-error-as-warning 
-debugger-tuning=gdb -v -resource-dir /home/aland/intel-sycl/llvm/build/install/lib/clang/16 -fdebug-compilation-dir=/home/aland/sycl_tests/clang-compile -ferror-limit 19 -fno-gpu-rdc -fgnuc-version=4.2.1 -no-opaque-pointers -fcolor-diagnostics -faddrsig -o /tmp/main-18995e/main-gfx1031.o -x object /tmp/main-85e66c/main-gfx1031.o
error: invalid value 'object' in '-x object'
llvm-foreach:
make: *** [Makefile:12: main] Error 1
Environment (please complete the following information):
- OS: Ubuntu 22.04
- Target device and vendor: NVIDIA GPU
- DPC++ version: 4f75718
- Dependencies version: CUDA 11.8, ROCm 5.4.2