
[SYCL][NVPTX] Set default fdiv and sqrt for llvm.fpbuiltin #16714


Merged 16 commits on Jan 30, 2025
51 changes: 51 additions & 0 deletions llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/FormatVariadic.h"

@@ -106,6 +107,48 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
return true;
}

static bool replaceWithNVPTXCalls(FPBuiltinIntrinsic &BuiltinCall) {
Contributor: We only call this function if the requested precision is exactly 3.0, but the name is quite generic. I think it is worth adding a comment right before the function to better specify its intent and expected use case, in case it is extended in the future.

Contributor (author): Renamed the function and added the comment.

  IRBuilder<> IRBuilder(&BuiltinCall);
  SmallVector<Value *> Args(BuiltinCall.args());
  Value *Replacement = nullptr;
  // Used to choose between the ftz and non-ftz intrinsic variants.
  FastMathFlags FMF = BuiltinCall.getFastMathFlags();
  auto *Type = BuiltinCall.getType();
  // For now, only add lowering for fdiv and sqrt, although nvvm intrinsics
  // also have approximate variants for sin, cos, exp2 and log2.
  // For vector fpbuiltins on the NVPTX target there are no nvvm intrinsics,
  // so use the standard LLVM math operations instead. Note also that the
  // nvvm fdiv and sqrt intrinsics support only the float type.
  switch (BuiltinCall.getIntrinsicID()) {
  case Intrinsic::fpbuiltin_fdiv:
    if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
      return replaceWithLLVMIR(BuiltinCall);
    Replacement =
        IRBuilder.CreateIntrinsic(Type,
                                  FMF.isFast()
                                      ? Intrinsic::nvvm_div_approx_ftz_f
                                      : Intrinsic::nvvm_div_approx_f,
                                  Args);
    break;
  case Intrinsic::fpbuiltin_sqrt:
    if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
      return replaceWithLLVMIR(BuiltinCall);
    Replacement =
        IRBuilder.CreateIntrinsic(Type,
                                  FMF.isFast()
                                      ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                      : Intrinsic::nvvm_sqrt_approx_f,
                                  Args);
    break;
  default:
    return false;
  }
  BuiltinCall.replaceAllUsesWith(Replacement);
  cast<Instruction>(Replacement)->copyFastMathFlags(&BuiltinCall);
  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
                    << BuiltinCall.getCalledFunction()->getName()
                    << "` with equivalent IR.\n");
  return true;
}

static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
                                      const TargetTransformInfo &TTI,
                                      FPBuiltinIntrinsic &BuiltinCall) {
@@ -154,6 +197,14 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
}
}

  // We don't have an implementation for the CUDA approximate-precision
  // builtins, so map them to NVPTX intrinsics. If no appropriate intrinsic
  // is known, fall through to replaceWithAltMathFunction.
  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() == 3.0) {
    if (replaceWithNVPTXCalls(BuiltinCall))
Contributor: Should we somehow encode the "skip"/"fallback" to alt math functions part into the name?

Contributor (author): Not sure; from my perspective the code here is not complex and speaks for itself. :)

Contributor: My point was not about complexity, but about expectations. Say you read this specific function top-down to understand the high-level structure first. Just by looking at the name, you may expect this step to introduce some NVPTX-specific intrinsics, but in fact it can perform some other form of lowering, which could be surprising unless you also read through the function itself.

Contributor (author): Sure, renamed and clarified in the comment.

      return true;
  }

  /// Call TLI to select a function implementation to call
  StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall);
  if (ImplName.empty()) {
@@ -0,0 +1,81 @@
; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
Contributor (author): I put this test in CodeGen because Andy was adding his tests for fpbuiltin there (including those which test just the transformations without invoking llc).

Contributor: I would prefer those tests to be in Transforms instead, but I'm not against adding a single new test to CodeGen, simply because they should either all be in the right location or none of them (i.e. the move should be done as a separate PR).

Contributor (author): Totally agree, let's keep the tests in one place for now. If necessary we will move some of them to the Transforms dir.


target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; CHECK-LABEL: @test_fdiv
; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}}
define void @test_fdiv(float %d1, <2 x float> %v2d1,
                       float %d2, <2 x float> %v2d2) {
entry:
%t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
%t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
ret void
}

; CHECK-LABEL: @test_fdiv_fast
; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.ftz.f(float %{{.*}}, float %{{.*}})
; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}}
define void @test_fdiv_fast(float %d1, <2 x float> %v2d1,
                            float %d2, <2 x float> %v2d2) {
entry:
%t0 = call fast float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
%t1 = call fast <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
ret void
}

declare float @llvm.fpbuiltin.fdiv.f32(float, float)
declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>)

; CHECK-LABEL: @test_fdiv_double
; CHECK: %{{.*}} = fdiv double %{{.*}}, %{{.*}}
; CHECK: %{{.*}} = fdiv <2 x double> %{{.*}}, %{{.*}}
define void @test_fdiv_double(double %d1, <2 x double> %v2d1,
                              double %d2, <2 x double> %v2d2) {
entry:
%t0 = call double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
%t1 = call <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
ret void
}

declare double @llvm.fpbuiltin.fdiv.f64(double, double)
declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)

; CHECK-LABEL: @test_sqrt
; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}})
; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) {
entry:
%t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #0
%t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
ret void
}

; CHECK-LABEL: @test_sqrt_fast
; CHECK: %{{.*}} = call fast float @llvm.nvvm.sqrt.approx.ftz.f(float %{{.*}})
; CHECK: %{{.*}} = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
define void @test_sqrt_fast(float %d, <2 x float> %v2d, <4 x float> %v4d) {
entry:
%t0 = call fast float @llvm.fpbuiltin.sqrt.f32(float %d) #0
%t1 = call fast <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #0
ret void
}

declare float @llvm.fpbuiltin.sqrt.f32(float)
declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)

; CHECK-LABEL: @test_sqrt_double
; CHECK: %{{.*}} = call double @llvm.sqrt.f64(double %{{.*}})
; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
define void @test_sqrt_double(double %d, <2 x double> %v2d) {
entry:
%t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
%t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
ret void
}

declare double @llvm.fpbuiltin.sqrt.f64(double)
declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)

attributes #0 = { "fpbuiltin-max-error"="3.0" }