From 4f30a79ac85e7c2e3cb5a8169c1003821f4c4aea Mon Sep 17 00:00:00 2001 From: pgorlani Date: Thu, 6 Jan 2022 12:00:44 +0000 Subject: [PATCH 01/11] [SYCL][CUDA][libclc] Add approx. tanhf built-in This patch adds the support for an approximate hyperbolic tangent single-precision built-in function introduced in PTX 7.0 for devices having compute capabilities >= 7.5. If this built-in is available, it is possible use it by setting the `-fcuda-approx-tanhf` flag. --- clang/include/clang/Basic/BuiltinsNVPTX.def | 4 ++++ clang/include/clang/Basic/TargetOptions.h | 3 +++ clang/include/clang/Driver/Options.td | 4 ++++ clang/lib/CodeGen/CodeGenModule.cpp | 3 +++ clang/test/CodeGenCUDA/flush-denormals.cu | 4 ++-- clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu | 11 +++++++++++ libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 13 ++++++++++++- libclc/ptx-nvidiacl/libspirv/reflect.ll | 7 +++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 7 +++++++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 7 +++++++ llvm/lib/Target/NVPTX/NVVMReflect.cpp | 6 ++++++ 11 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index bb67c1abf2a6..dbde3232f9a9 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -218,6 +218,10 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "") BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "") BUILTIN(__nvvm_cos_approx_f, "ff", "") +// Tanh + +TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70)) + // Fma BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "") diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index 07542feb9d81..dafccbf00bb1 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -78,6 +78,9 @@ class TargetOptions { /// \brief If enabled, use precise square root bool NVVMCudaPrecSqrt = false; + /// \brief If enabled, use approximate tanh + bool NVVMCudaApproxTanhf = false; + /// \brief If enabled, allow AMDGPU unsafe floating point atomics. bool AllowAMDGPUUnsafeFPAtomics = false; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index ad85abdf1bb9..35703d6985b9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -985,6 +985,10 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", PosFlag, NegFlag, BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>; +defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf", + TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, + PosFlag= 7.5">, + NegFlag>; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; def hip_path_EQ : Joined<["--"], "hip-path=">, Group, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index e04906935483..f5e33a824863 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -780,6 +780,9 @@ void CodeGenModule::Release() { llvm::DenormalMode::IEEE); getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt", getTarget().getTargetOpts().NVVMCudaPrecSqrt); + getModule().addModuleFlag(llvm::Module::Override, + "nvvm-reflect-approx-tanhf", + getTarget().getTargetOpts().NVVMCudaApproxTanhf); } if (LangOpts.EHAsynch) diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu index f19132b0b464..ca0ac97859ef 100644 --- a/clang/test/CodeGenCUDA/flush-denormals.cu +++ b/clang/test/CodeGenCUDA/flush-denormals.cu @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" // NOFTZ-NOT: "denormal-fp-math-f32" -// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} +// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}} // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} -// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} +// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}} // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0} diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu new file mode 100644 index 000000000000..a63cbfff514b --- /dev/null +++ b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-approx-tanhf %s -o -| FileCheck --check-prefix=CHECK-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s + +#include "Inputs/cuda.h" + +// Check that the -fcuda-approx-tanhf flag correctly sets the nvvm-reflect module flags. + +extern "C" __device__ void foo() {} + +// CHECK-ON: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 1} +// CHECK-OFF: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 0} diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl index f443c36c0411..682428762164 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl @@ -11,7 +11,18 @@ #include "../../include/libdevice.h" #include +int __clc_nvvm_reflect_arch(); +int __clc_nvvm_reflect_approx_tanh(); + +float __my_tanhf (float x){ + if(__clc_nvvm_reflect_approx_tanh()) { + return __nvvm_tanh_approx_f(x); + } else { + return __nv_tanhf(x); + } +} + #define __CLC_FUNCTION __spirv_ocl_tanh #define __CLC_BUILTIN __nv_tanh -#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) +#define __CLC_BUILTIN_F __my_tanhf #include diff --git a/libclc/ptx-nvidiacl/libspirv/reflect.ll b/libclc/ptx-nvidiacl/libspirv/reflect.ll index 91ae4135644d..978489adaf19 100755 --- a/libclc/ptx-nvidiacl/libspirv/reflect.ll +++ b/libclc/ptx-nvidiacl/libspirv/reflect.ll @@ -6,3 +6,10 @@ define i32 @__clc_nvvm_reflect_arch() alwaysinline { %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*)) ret i32 %reflect } + +@str_approx_tanh = private addrspace(1) constant [20 x i8] c"__CUDA_APPROX_TANHF\00" + +define i32 @__clc_nvvm_reflect_approx_tanh() alwaysinline { + %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([20 x i8], [20 x i8] addrspace(1)* @str_approx_tanh, i32 0, i32 0) to i8*)) + ret i32 %reflect +} diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index fb7598d6ac75..bee180f80bee 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -808,6 +808,13 @@ let TargetPrefix = "nvvm" in { def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; +// +// Tanh +// + + def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">, + DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + // // Fma // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d7c8acd14608..507e667fc284 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -784,6 +784,13 @@ def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;", def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_cos_approx_f>; +// +// Tanh +// + +def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;", + Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>; + // // Fma // diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 6ffc49a59a55..396fa1880520 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -176,6 +176,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { if (auto *Flag = mdconst::extract_or_null( F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt"))) ReflectVal = Flag->getSExtValue(); + } else if (ReflectArg == "__CUDA_APPROX_TANHF") { + // Try to pull __CUDA_APPROX_TANHF from the nvvm-reflect-approx-tanhf + // module flag. + if (auto *Flag = mdconst::extract_or_null( + F.getParent()->getModuleFlag("nvvm-reflect-approx-tanhf"))) + ReflectVal = Flag->getSExtValue(); } Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); From a151a75ef8a718855aec23d476fa2c1c410796a5 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 26 Jan 2022 09:37:11 +0000 Subject: [PATCH 02/11] Add architecture check in libclc --- libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl index 682428762164..1f9ae67437e6 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl @@ -14,8 +14,8 @@ int __clc_nvvm_reflect_arch(); int __clc_nvvm_reflect_approx_tanh(); -float __my_tanhf (float x){ - if(__clc_nvvm_reflect_approx_tanh()) { +float __select_tanhf (float x){ + if(__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) { return __nvvm_tanh_approx_f(x); } else { return __nv_tanhf(x); @@ -24,5 +24,5 @@ float __my_tanhf (float x){ #define __CLC_FUNCTION __spirv_ocl_tanh #define __CLC_BUILTIN __nv_tanh -#define __CLC_BUILTIN_F __my_tanhf +#define __CLC_BUILTIN_F __select_tanhf #include From 3db7642d63c3a9b02a0404b84c4593c0cf53d64e Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Mon, 31 Jan 2022 09:18:10 +0000 Subject: [PATCH 03/11] Update flag description --- clang/include/clang/Driver/Options.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 35703d6985b9..92130fca0d03 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -987,7 +987,7 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>; defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf", TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, - PosFlag= 7.5">, + PosFlag= 8.0">, NegFlag>; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; From 7b89c001a8250153a81747e7f0249ec80db9a148 Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Mon, 31 Jan 2022 09:52:15 +0000 Subject: [PATCH 04/11] Update libclc/ptx-nvidiacl/libspirv/math/tanh.cl Co-authored-by: Alexey Bader --- libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl index 1f9ae67437e6..6f79aed1b572 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl @@ -14,12 +14,11 @@ int __clc_nvvm_reflect_arch(); int __clc_nvvm_reflect_approx_tanh(); -float __select_tanhf (float x){ - if(__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) { +float __select_tanhf (float x) { + if (__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) { return __nvvm_tanh_approx_f(x); - } else { - return __nv_tanhf(x); } + return __nv_tanhf(x); } #define __CLC_FUNCTION __spirv_ocl_tanh From 44cd96cfd792939868e5c035041f4341ec5d2714 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Mon, 31 Jan 2022 10:13:58 +0000 Subject: [PATCH 05/11] Increase intrinsic architecture --- clang/include/clang/Basic/BuiltinsNVPTX.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index dbde3232f9a9..3153d8687fdf 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -220,7 +220,7 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "") // Tanh -TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70)) // Fma From f6cba9c3909b2fd710e285e2223c4d3e594cee9f Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Mon, 7 Feb 2022 09:12:16 +0000 Subject: [PATCH 06/11] Update flag message --- clang/include/clang/Driver/Options.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 92130fca0d03..c8514947c3a5 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -987,7 +987,7 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>; defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf", TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, - PosFlag= 8.0">, + PosFlag= 8.0">, NegFlag>; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; From 8efe3c12abd8e3ef04b8788b743bbfc1378a52e9 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 16 Feb 2022 17:29:35 +0000 Subject: [PATCH 07/11] rename flag, apply module flag only for sycl --- clang/include/clang/Driver/Options.td | 9 ++++----- clang/lib/CodeGen/CodeGenModule.cpp | 3 +++ clang/test/CodeGenCUDA/flush-denormals.cu | 4 ++-- clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu | 6 +++--- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c8514947c3a5..d6a26c8a2404 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -985,10 +985,6 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", PosFlag, NegFlag, BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>; -defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf", - TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, - PosFlag= 8.0">, - NegFlag>; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; def hip_path_EQ : Joined<["--"], "hip-path=">, Group, @@ -4730,7 +4726,10 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group, HelpText<"Control exclusion of " "device libraries from device binary linkage. Valid arguments " "are libc, libm-fp32, libm-fp64, all">; - +defm nvvm_cuda_approx_tanh : BoolFOption<"sycl-cuda-approx-tanh", + TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, + PosFlag= 8.0">, + NegFlag>; //===----------------------------------------------------------------------===// // FLangOption + CoreOption + NoXarchOption //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index f5e33a824863..5e576a98afa1 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -780,6 +780,9 @@ void CodeGenModule::Release() { llvm::DenormalMode::IEEE); getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt", getTarget().getTargetOpts().NVVMCudaPrecSqrt); + } + + if ( LangOpts.isSYCL() && getTriple().isNVPTX()) { getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-approx-tanhf", getTarget().getTargetOpts().NVVMCudaApproxTanhf); diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu index ca0ac97859ef..f19132b0b464 100644 --- a/clang/test/CodeGenCUDA/flush-denormals.cu +++ b/clang/test/CodeGenCUDA/flush-denormals.cu @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" // NOFTZ-NOT: "denormal-fp-math-f32" -// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}} +// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} -// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}} +// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0} diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu index a63cbfff514b..ec5c335d047d 100644 --- a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu +++ b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-approx-tanhf %s -o -| FileCheck --check-prefix=CHECK-ON %s -// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s +// RUN: %clang_cc1 -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fsycl-cuda-approx-tanh %s -o -| FileCheck --check-prefix=CHECK-ON %s +// RUN: %clang_cc1 -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s #include "Inputs/cuda.h" -// Check that the -fcuda-approx-tanhf flag correctly sets the nvvm-reflect module flags. +// Check that the -fsycl-cuda-approx-tanh flag correctly sets the nvvm-reflect module flags. extern "C" __device__ void foo() {} From 464e7611c1b38ee8090455cbc1905d82b0ffc113 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 23 Feb 2022 16:10:52 +0000 Subject: [PATCH 08/11] add f16 and f16x2 intrinsics --- clang/include/clang/Basic/BuiltinsNVPTX.def | 2 ++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 4 ++++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 3153d8687fdf..72d9815dc2f8 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -221,6 +221,8 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "") // Tanh TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) // Fma diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index bee180f80bee..8e5ad53fc298 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -814,6 +814,10 @@ let TargetPrefix = "nvvm" in { def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_nvvm_tanh_approx_f16 : GCCBuiltin<"__nvvm_tanh_approx_f16">, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>; + def int_nvvm_tanh_approx_f16x2 : GCCBuiltin<"__nvvm_tanh_approx_f16x2">, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>; // // Fma diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 507e667fc284..f0bacf9dadce 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -790,6 +790,10 @@ def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;", Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>; +def INT_NVVM_TANH_APPROX_F16 : F_MATH_1<"tanh.approx.f16 \t$dst, $src0;", + Float16Regs, Float16Regs, int_nvvm_tanh_approx_f16>; +def INT_NVVM_TANH_APPROX_F16X2 : F_MATH_1<"tanh.approx.f16x2 \t$dst, $src0;", + Float16x2Regs, Float16x2Regs, int_nvvm_tanh_approx_f16x2>; // // Fma From 5db0f72a2eccbed13b0f94ab6d5ea3b05858f647 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Thu, 24 Feb 2022 09:59:44 +0000 Subject: [PATCH 09/11] add f16 and f16x2 to liclc with half vects made of f16x2 --- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- libclc/generic/include/clcmacro.h | 18 ++++---- libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 52 +++++++++++++++++------ 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 5e576a98afa1..741d0992a2ee 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -782,7 +782,7 @@ void CodeGenModule::Release() { getTarget().getTargetOpts().NVVMCudaPrecSqrt); } - if ( LangOpts.isSYCL() && getTriple().isNVPTX()) { + if (LangOpts.isSYCL() && getTriple().isNVPTX()) { getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-approx-tanhf", getTarget().getTargetOpts().NVVMCudaApproxTanhf); diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h index d4167a8e4529..3bb309324422 100644 --- a/libclc/generic/include/clcmacro.h +++ b/libclc/generic/include/clcmacro.h @@ -9,11 +9,7 @@ #ifndef __CLC_MACRO_H #define __CLC_MACRO_H -#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ - return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ - } \ - \ +#define _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \ return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \ } \ @@ -30,12 +26,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \ } -#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ - ARG2_TYPE) \ - DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ - return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ +#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \ + return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \ } \ - \ + _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) + +#define _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \ return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \ FUNCTION(x.z, y.z)); \ diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl index 6f79aed1b572..9a36fc24036f 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl @@ -11,17 +11,45 @@ #include "../../include/libdevice.h" #include -int __clc_nvvm_reflect_arch(); -int __clc_nvvm_reflect_approx_tanh(); - -float __select_tanhf (float x) { - if (__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) { - return __nvvm_tanh_approx_f(x); - } - return __nv_tanhf(x); +extern int __clc_nvvm_reflect_arch(); +extern int __clc_nvvm_reflect_approx_tanh(); + +#define __USE_TANH_APPROX \ + (__clc_nvvm_reflect_approx_tanh() && (__clc_nvvm_reflect_arch() >= 750)) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DEF _CLC_OVERLOAD double __spirv_ocl_tanh(double x) { + return __nv_tanh(x); } -#define __CLC_FUNCTION __spirv_ocl_tanh -#define __CLC_BUILTIN __nv_tanh -#define __CLC_BUILTIN_F __select_tanhf -#include +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_tanh, double) + +#endif + +_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_tanh(float x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f(x) : __nv_tanhf(x); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_tanh, float) + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_tanh(half x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16(x) : __nv_tanhf(x); +} + +_CLC_DEF _CLC_OVERLOAD half2 __spirv_ocl_tanh(half2 x) { + return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16x2(x) + : (half2)(__nv_tanhf(x.x), __nv_tanhf(x.y)); +} + +_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_tanh, half) + +#endif + +#undef __USE_TANH_APPROX From 75e1ee8411dd0e0919339d35200d48c8f87800cc Mon Sep 17 00:00:00 2001 From: pgorlani Date: Thu, 24 Feb 2022 10:14:14 +0000 Subject: [PATCH 10/11] bring back to sm_75 considering https://github.com/intel/llvm/pull/5642 --- clang/include/clang/Basic/BuiltinsNVPTX.def | 2 +- clang/include/clang/Driver/Options.td | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 72d9815dc2f8..35c7e731b7d1 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -220,7 +220,7 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "") // Tanh -TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70)) TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70)) TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70)) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d6a26c8a2404..e650b342f157 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4728,7 +4728,7 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group; defm nvvm_cuda_approx_tanh : BoolFOption<"sycl-cuda-approx-tanh", TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse, - PosFlag= 8.0">, + PosFlag=7.5">, NegFlag>; //===----------------------------------------------------------------------===// // FLangOption + CoreOption + NoXarchOption From 5f780f6d717a0c29398c857278c41bc3ec14904e Mon Sep 17 00:00:00 2001 From: pgorlani Date: Thu, 24 Feb 2022 13:11:46 +0000 Subject: [PATCH 11/11] add have2 vectorization --- libclc/generic/include/clcmacro.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h index 3bb309324422..b53043aa1966 100644 --- a/libclc/generic/include/clcmacro.h +++ b/libclc/generic/include/clcmacro.h @@ -51,6 +51,14 @@ return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \ } +#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \ + } \ + _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE) + #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \ @@ -105,6 +113,15 @@ FUNCTION(x.hi, y.hi, z.hi)); \ } +#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \ + ARG3_TYPE##2 z) { \ + return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \ + } \ + _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ + ARG2_TYPE, ARG3_TYPE) + #define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ ARG2_TYPE, ARG3_TYPE) \ DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) { \