From 4f30a79ac85e7c2e3cb5a8169c1003821f4c4aea Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Thu, 6 Jan 2022 12:00:44 +0000
Subject: [PATCH 01/11] [SYCL][CUDA][libclc] Add approx. tanhf built-in

This patch adds the support for an approximate
hyperbolic tangent single-precision built-in
function introduced in PTX 7.0 for devices having
compute capabilities >= 7.5.

If this built-in is available, it is possible use
it by setting the `-fcuda-approx-tanhf` flag.
---
 clang/include/clang/Basic/BuiltinsNVPTX.def        |  4 ++++
 clang/include/clang/Basic/TargetOptions.h          |  3 +++
 clang/include/clang/Driver/Options.td              |  4 ++++
 clang/lib/CodeGen/CodeGenModule.cpp                |  3 +++
 clang/test/CodeGenCUDA/flush-denormals.cu          |  4 ++--
 clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu | 11 +++++++++++
 libclc/ptx-nvidiacl/libspirv/math/tanh.cl          | 13 ++++++++++++-
 libclc/ptx-nvidiacl/libspirv/reflect.ll            |  7 +++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td             |  7 +++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td           |  7 +++++++
 llvm/lib/Target/NVPTX/NVVMReflect.cpp              |  6 ++++++
 11 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index bb67c1abf2a6..dbde3232f9a9 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -218,6 +218,10 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "")
 BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "")
 BUILTIN(__nvvm_cos_approx_f, "ff", "")
 
+// Tanh
+
+TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70))
+
 // Fma
 
 BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index 07542feb9d81..dafccbf00bb1 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -78,6 +78,9 @@ class TargetOptions {
   /// \brief If enabled, use precise square root
   bool NVVMCudaPrecSqrt = false;
 
+  /// \brief If enabled, use approximate tanh
+  bool NVVMCudaApproxTanhf = false;
+
   /// \brief If enabled, allow AMDGPU unsafe floating point atomics.
   bool AllowAMDGPUUnsafeFPAtomics = false;
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index ad85abdf1bb9..35703d6985b9 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -985,6 +985,10 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
   PosFlag<SetTrue, [CC1Option], "Specify">,
   NegFlag<SetFalse, [], "Don't specify">,
   BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
+defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf",
+  TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
+  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 7.5">,
+  NegFlag<SetFalse>>;
 def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
   HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
 def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e04906935483..f5e33a824863 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -780,6 +780,9 @@ void CodeGenModule::Release() {
                                   llvm::DenormalMode::IEEE);
     getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
                               getTarget().getTargetOpts().NVVMCudaPrecSqrt);
+    getModule().addModuleFlag(llvm::Module::Override,
+                              "nvvm-reflect-approx-tanhf",
+                              getTarget().getTargetOpts().NVVMCudaApproxTanhf);
   }
 
   if (LangOpts.EHAsynch)
diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu
index f19132b0b464..ca0ac97859ef 100644
--- a/clang/test/CodeGenCUDA/flush-denormals.cu
+++ b/clang/test/CodeGenCUDA/flush-denormals.cu
@@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
 // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
 // NOFTZ-NOT: "denormal-fp-math-f32"
 
-// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
+// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
 // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
 
-// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
+// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
 // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
new file mode 100644
index 000000000000..a63cbfff514b
--- /dev/null
+++ b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm  -fcuda-approx-tanhf %s -o -| FileCheck --check-prefix=CHECK-ON %s
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s
+
+#include "Inputs/cuda.h"
+
+// Check that the -fcuda-approx-tanhf flag correctly sets the nvvm-reflect module flags.
+
+extern "C" __device__ void foo() {}
+
+// CHECK-ON: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 1}
+// CHECK-OFF: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 0}
diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
index f443c36c0411..682428762164 100644
--- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
+++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
@@ -11,7 +11,18 @@
 #include "../../include/libdevice.h"
 #include <clcmacro.h>
 
+int __clc_nvvm_reflect_arch();
+int __clc_nvvm_reflect_approx_tanh();
+
+float __my_tanhf (float x){
+  if(__clc_nvvm_reflect_approx_tanh()) {
+    return __nvvm_tanh_approx_f(x);
+  } else {
+    return __nv_tanhf(x);
+  }
+}
+
 #define __CLC_FUNCTION __spirv_ocl_tanh
 #define __CLC_BUILTIN __nv_tanh
-#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
+#define __CLC_BUILTIN_F __my_tanhf
 #include <math/unary_builtin.inc>
diff --git a/libclc/ptx-nvidiacl/libspirv/reflect.ll b/libclc/ptx-nvidiacl/libspirv/reflect.ll
index 91ae4135644d..978489adaf19 100755
--- a/libclc/ptx-nvidiacl/libspirv/reflect.ll
+++ b/libclc/ptx-nvidiacl/libspirv/reflect.ll
@@ -6,3 +6,10 @@ define i32 @__clc_nvvm_reflect_arch() alwaysinline {
   %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
   ret i32 %reflect
 }
+
+@str_approx_tanh = private addrspace(1) constant [20 x i8] c"__CUDA_APPROX_TANHF\00"
+
+define i32 @__clc_nvvm_reflect_approx_tanh() alwaysinline {
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([20 x i8], [20 x i8] addrspace(1)* @str_approx_tanh, i32 0, i32 0) to i8*))
+  ret i32 %reflect
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index fb7598d6ac75..bee180f80bee 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -808,6 +808,13 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">,
       DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
 
+//
+// Tanh
+//
+
+  def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">,
+       DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
 //
 // Fma
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index d7c8acd14608..507e667fc284 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -784,6 +784,13 @@ def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
 def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
 
+//
+// Tanh
+//
+
+def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;",
+   Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>;
+
 //
 // Fma
 //
diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index 6ffc49a59a55..396fa1880520 100644
--- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -176,6 +176,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
       if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
               F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
         ReflectVal = Flag->getSExtValue();
+    } else if (ReflectArg == "__CUDA_APPROX_TANHF") {
+      // Try to pull __CUDA_APPROX_TANHF from the nvvm-reflect-approx-tanhf
+      // module flag.
+      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
+              F.getParent()->getModuleFlag("nvvm-reflect-approx-tanhf")))
+        ReflectVal = Flag->getSExtValue();
     }
     Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
     ToRemove.push_back(Call);

From a151a75ef8a718855aec23d476fa2c1c410796a5 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Wed, 26 Jan 2022 09:37:11 +0000
Subject: [PATCH 02/11] Add architecture check in libclc

---
 libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
index 682428762164..1f9ae67437e6 100644
--- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
+++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
@@ -14,8 +14,8 @@
 int __clc_nvvm_reflect_arch();
 int __clc_nvvm_reflect_approx_tanh();
 
-float __my_tanhf (float x){
-  if(__clc_nvvm_reflect_approx_tanh()) {
+float __select_tanhf (float x){
+  if(__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) {
     return __nvvm_tanh_approx_f(x);
   } else {
     return __nv_tanhf(x);
@@ -24,5 +24,5 @@ float __my_tanhf (float x){
 
 #define __CLC_FUNCTION __spirv_ocl_tanh
 #define __CLC_BUILTIN __nv_tanh
-#define __CLC_BUILTIN_F __my_tanhf
+#define __CLC_BUILTIN_F __select_tanhf
 #include <math/unary_builtin.inc>

From 3db7642d63c3a9b02a0404b84c4593c0cf53d64e Mon Sep 17 00:00:00 2001
From: pgorlani <92453485+pgorlani@users.noreply.github.com>
Date: Mon, 31 Jan 2022 09:18:10 +0000
Subject: [PATCH 03/11] Update flag description

---
 clang/include/clang/Driver/Options.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 35703d6985b9..92130fca0d03 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -987,7 +987,7 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
   BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
 defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf",
   TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
-  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 7.5">,
+  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 8.0">,
   NegFlag<SetFalse>>;
 def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
   HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;

From 7b89c001a8250153a81747e7f0249ec80db9a148 Mon Sep 17 00:00:00 2001
From: pgorlani <92453485+pgorlani@users.noreply.github.com>
Date: Mon, 31 Jan 2022 09:52:15 +0000
Subject: [PATCH 04/11] Update libclc/ptx-nvidiacl/libspirv/math/tanh.cl

Co-authored-by: Alexey Bader <alexey.bader@intel.com>
---
 libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
index 1f9ae67437e6..6f79aed1b572 100644
--- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
+++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
@@ -14,12 +14,11 @@
 int __clc_nvvm_reflect_arch();
 int __clc_nvvm_reflect_approx_tanh();
 
-float __select_tanhf (float x){
-  if(__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) {
+float __select_tanhf (float x) {
+  if (__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) {
     return __nvvm_tanh_approx_f(x);
-  } else {
-    return __nv_tanhf(x);
   }
+  return __nv_tanhf(x);
 }
 
 #define __CLC_FUNCTION __spirv_ocl_tanh

From 44cd96cfd792939868e5c035041f4341ec5d2714 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Mon, 31 Jan 2022 10:13:58 +0000
Subject: [PATCH 05/11] Increase intrinsic architecture

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index dbde3232f9a9..3153d8687fdf 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -220,7 +220,7 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "")
 
 // Tanh
 
-TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70))
+TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70))
 
 // Fma
 

From f6cba9c3909b2fd710e285e2223c4d3e594cee9f Mon Sep 17 00:00:00 2001
From: pgorlani <92453485+pgorlani@users.noreply.github.com>
Date: Mon, 7 Feb 2022 09:12:16 +0000
Subject: [PATCH 06/11] Update flag message

---
 clang/include/clang/Driver/Options.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 92130fca0d03..c8514947c3a5 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -987,7 +987,7 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
   BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
 defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf",
   TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
-  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 8.0">,
+  PosFlag<SetTrue, [CC1Option], "Only SYCL. Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 8.0">,
   NegFlag<SetFalse>>;
 def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
   HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;

From 8efe3c12abd8e3ef04b8788b743bbfc1378a52e9 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Wed, 16 Feb 2022 17:29:35 +0000
Subject: [PATCH 07/11] rename flag, apply module flag only for sycl

---
 clang/include/clang/Driver/Options.td              | 9 ++++-----
 clang/lib/CodeGen/CodeGenModule.cpp                | 3 +++
 clang/test/CodeGenCUDA/flush-denormals.cu          | 4 ++--
 clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu | 6 +++---
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c8514947c3a5..d6a26c8a2404 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -985,10 +985,6 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
   PosFlag<SetTrue, [CC1Option], "Specify">,
   NegFlag<SetFalse, [], "Don't specify">,
   BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
-defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf",
-  TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
-  PosFlag<SetTrue, [CC1Option], "Only SYCL. Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 8.0">,
-  NegFlag<SetFalse>>;
 def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
   HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
 def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
@@ -4730,7 +4726,10 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group<s
   Values<"libc, libm-fp32, libm-fp64, all">, HelpText<"Control exclusion of "
   "device libraries from device binary linkage. Valid arguments "
   "are libc, libm-fp32, libm-fp64, all">;
-
+defm nvvm_cuda_approx_tanh : BoolFOption<"sycl-cuda-approx-tanh",
+  TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
+  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanh function. Device needs to have a compute capability >= 8.0">,
+  NegFlag<SetFalse>>;
 //===----------------------------------------------------------------------===//
 // FLangOption + CoreOption + NoXarchOption
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index f5e33a824863..5e576a98afa1 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -780,6 +780,9 @@ void CodeGenModule::Release() {
                                   llvm::DenormalMode::IEEE);
     getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
                               getTarget().getTargetOpts().NVVMCudaPrecSqrt);
+  }
+
+  if ( LangOpts.isSYCL() && getTriple().isNVPTX()) {
     getModule().addModuleFlag(llvm::Module::Override,
                               "nvvm-reflect-approx-tanhf",
                               getTarget().getTargetOpts().NVVMCudaApproxTanhf);
diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu
index ca0ac97859ef..f19132b0b464 100644
--- a/clang/test/CodeGenCUDA/flush-denormals.cu
+++ b/clang/test/CodeGenCUDA/flush-denormals.cu
@@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
 // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
 // NOFTZ-NOT: "denormal-fp-math-f32"
 
-// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
+// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
 // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
 
-// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
+// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
 // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
index a63cbfff514b..ec5c335d047d 100644
--- a/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
+++ b/clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm  -fcuda-approx-tanhf %s -o -| FileCheck --check-prefix=CHECK-ON %s
-// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s
+// RUN: %clang_cc1 -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm  -fsycl-cuda-approx-tanh %s -o -| FileCheck --check-prefix=CHECK-ON %s
+// RUN: %clang_cc1 -fsycl-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s
 
 #include "Inputs/cuda.h"
 
-// Check that the -fcuda-approx-tanhf flag correctly sets the nvvm-reflect module flags.
+// Check that the -fsycl-cuda-approx-tanh flag correctly sets the nvvm-reflect module flags.
 
 extern "C" __device__ void foo() {}
 

From 464e7611c1b38ee8090455cbc1905d82b0ffc113 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Wed, 23 Feb 2022 16:10:52 +0000
Subject: [PATCH 08/11] add f16 and f16x2 intrinsics

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 2 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td      | 4 ++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    | 4 ++++
 3 files changed, 10 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 3153d8687fdf..72d9815dc2f8 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -221,6 +221,8 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "")
 // Tanh
 
 TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70))
+TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
 
 // Fma
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index bee180f80bee..8e5ad53fc298 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -814,6 +814,10 @@ let TargetPrefix = "nvvm" in {
 
   def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">,
        DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_nvvm_tanh_approx_f16 : GCCBuiltin<"__nvvm_tanh_approx_f16">,
+      DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
+  def int_nvvm_tanh_approx_f16x2 : GCCBuiltin<"__nvvm_tanh_approx_f16x2">,
+      DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
 
 //
 // Fma
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 507e667fc284..f0bacf9dadce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -790,6 +790,10 @@ def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
 
 def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;",
    Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>;
+def INT_NVVM_TANH_APPROX_F16 : F_MATH_1<"tanh.approx.f16 \t$dst, $src0;",
+  Float16Regs, Float16Regs, int_nvvm_tanh_approx_f16>;
+def INT_NVVM_TANH_APPROX_F16X2 : F_MATH_1<"tanh.approx.f16x2 \t$dst, $src0;",
+  Float16x2Regs, Float16x2Regs, int_nvvm_tanh_approx_f16x2>;
 
 //
 // Fma

From 5db0f72a2eccbed13b0f94ab6d5ea3b05858f647 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Thu, 24 Feb 2022 09:59:44 +0000
Subject: [PATCH 09/11] add f16 and f16x2 to liclc with half vects made of
 f16x2

---
 clang/lib/CodeGen/CodeGenModule.cpp       |  2 +-
 libclc/generic/include/clcmacro.h         | 18 ++++----
 libclc/ptx-nvidiacl/libspirv/math/tanh.cl | 52 +++++++++++++++++------
 3 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 5e576a98afa1..741d0992a2ee 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -782,7 +782,7 @@ void CodeGenModule::Release() {
                               getTarget().getTargetOpts().NVVMCudaPrecSqrt);
   }
 
-  if ( LangOpts.isSYCL() && getTriple().isNVPTX()) {
+  if (LangOpts.isSYCL() && getTriple().isNVPTX()) {
     getModule().addModuleFlag(llvm::Module::Override,
                               "nvvm-reflect-approx-tanhf",
                               getTarget().getTargetOpts().NVVMCudaApproxTanhf);
diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h
index d4167a8e4529..3bb309324422 100644
--- a/libclc/generic/include/clcmacro.h
+++ b/libclc/generic/include/clcmacro.h
@@ -9,11 +9,7 @@
 #ifndef __CLC_MACRO_H
 #define __CLC_MACRO_H
 
-#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)          \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) {                              \
-    return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y));                        \
-  }                                                                            \
-                                                                               \
+#define _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)    \
   DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) {                              \
     return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z));         \
   }                                                                            \
@@ -30,12 +26,14 @@
     return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi));                     \
   }
 
-#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
-                              ARG2_TYPE)                                       \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) {              \
-    return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y));              \
+#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)          \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) {                              \
+    return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y));                        \
   }                                                                            \
-                                                                               \
+  _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)
+
+#define _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,   \
+                                    ARG2_TYPE)                                 \
   DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) {              \
     return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y),               \
                          FUNCTION(x.z, y.z));                                  \
diff --git a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
index 6f79aed1b572..9a36fc24036f 100644
--- a/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
+++ b/libclc/ptx-nvidiacl/libspirv/math/tanh.cl
@@ -11,17 +11,45 @@
 #include "../../include/libdevice.h"
 #include <clcmacro.h>
 
-int __clc_nvvm_reflect_arch();
-int __clc_nvvm_reflect_approx_tanh();
-
-float __select_tanhf (float x) {
-  if (__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) {
-    return __nvvm_tanh_approx_f(x);
-  }
-  return __nv_tanhf(x);
+extern int __clc_nvvm_reflect_arch();
+extern int __clc_nvvm_reflect_approx_tanh();
+
+#define __USE_TANH_APPROX                                                      \
+  (__clc_nvvm_reflect_approx_tanh() && (__clc_nvvm_reflect_arch() >= 750))
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DEF _CLC_OVERLOAD double __spirv_ocl_tanh(double x) {
+  return __nv_tanh(x);
 }
 
-#define __CLC_FUNCTION __spirv_ocl_tanh
-#define __CLC_BUILTIN __nv_tanh
-#define __CLC_BUILTIN_F __select_tanhf
-#include <math/unary_builtin.inc>
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_tanh, double)
+
+#endif
+
+_CLC_DEF _CLC_OVERLOAD float __spirv_ocl_tanh(float x) {
+  return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f(x) : __nv_tanhf(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_tanh, float)
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_tanh(half x) {
+  return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16(x) : __nv_tanhf(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD half2 __spirv_ocl_tanh(half2 x) {
+  return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16x2(x)
+                             : (half2)(__nv_tanhf(x.x), __nv_tanhf(x.y));
+}
+
+_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_tanh, half)
+
+#endif
+
+#undef __USE_TANH_APPROX

From 75e1ee8411dd0e0919339d35200d48c8f87800cc Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Thu, 24 Feb 2022 10:14:14 +0000
Subject: [PATCH 10/11] bring back to sm_75 considering
 https://github.com/intel/llvm/pull/5642

---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 2 +-
 clang/include/clang/Driver/Options.td       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 72d9815dc2f8..35c7e731b7d1 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -220,7 +220,7 @@ BUILTIN(__nvvm_cos_approx_f, "ff", "")
 
 // Tanh
 
-TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70))
 TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70))
 TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d6a26c8a2404..e650b342f157 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4728,7 +4728,7 @@ def fno_sycl_device_lib_EQ : CommaJoined<["-"], "fno-sycl-device-lib=">, Group<s
   "are libc, libm-fp32, libm-fp64, all">;
 defm nvvm_cuda_approx_tanh : BoolFOption<"sycl-cuda-approx-tanh",
   TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
-  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanh function. Device needs to have a compute capability >= 8.0">,
+  PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanh function for devices having c.c.>=7.5">,
   NegFlag<SetFalse>>;
 //===----------------------------------------------------------------------===//
 // FLangOption + CoreOption + NoXarchOption

From 5f780f6d717a0c29398c857278c41bc3ec14904e Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Thu, 24 Feb 2022 13:11:46 +0000
Subject: [PATCH 11/11] add have2 vectorization

---
 libclc/generic/include/clcmacro.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h
index 3bb309324422..b53043aa1966 100644
--- a/libclc/generic/include/clcmacro.h
+++ b/libclc/generic/include/clcmacro.h
@@ -51,6 +51,14 @@
     return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi));         \
   }
 
+#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
+                              ARG2_TYPE)                                       \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) {              \
+    return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y));              \
+  }                                                                            \
+  _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
+                              ARG2_TYPE)
+
 #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,          \
                              ARG2_TYPE)                                        \
   DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) {                 \
@@ -105,6 +113,15 @@
                           FUNCTION(x.hi, y.hi, z.hi));                         \
   }
 
+#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
+                               ARG2_TYPE, ARG3_TYPE)                           \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y,                \
+                                ARG3_TYPE##2 z) {                              \
+    return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y));    \
+  }                                                                            \
+  _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
+                               ARG2_TYPE, ARG3_TYPE)
+
 #define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
                                ARG2_TYPE, ARG3_TYPE)                           \
   DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) {    \