Wrap and test some more Float16 intrinsics #2644
base: master
Conversation
Too much to hope for that they had secretly included these in |
The |
OK! We now have some working |
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl
index 742058006..43518d0d1 100644
--- a/src/device/intrinsics/math.jl
+++ b/src/device/intrinsics/math.jl
@@ -217,7 +217,7 @@ end
@device_override function Base.exp10(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
- f = fma(f, log2(10.f0), -0.0f0)
+ f = fma(f, log2(10.0f0), -0.0f0)
f = @fastmath exp2(f)
r = Float16(f)
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index 6fa5768df..2355d0dd0 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -3,7 +3,7 @@ using SpecialFunctions
@testset "math" begin
@testset "log10" begin
for T in (Float32, Float64)
- @test testf(a->log10.(a), T[100])
+ @test testf(a -> log10.(a), T[100])
end
end
@@ -14,22 +14,22 @@ using SpecialFunctions
@test testf((x,y)->x.^y, rand(Float32, 1), -rand(range, 1))
end
end
-
+
@testset "min/max" begin
for T in (Float32, Float64)
- @test testf((x,y)->max.(x, y), rand(Float32, 1), rand(T, 1))
- @test testf((x,y)->min.(x, y), rand(Float32, 1), rand(T, 1))
+ @test testf((x, y) -> max.(x, y), rand(Float32, 1), rand(T, 1))
+ @test testf((x, y) -> min.(x, y), rand(Float32, 1), rand(T, 1))
end
end
@testset "isinf" begin
- for x in (Inf32, Inf, NaN16, NaN32, NaN)
+ for x in (Inf32, Inf, NaN16, NaN32, NaN)
@test testf(x->isinf.(x), [x])
end
end
@testset "isnan" begin
- for x in (Inf32, Inf, NaN16, NaN32, NaN)
+ for x in (Inf32, Inf, NaN16, NaN32, NaN)
@test testf(x->isnan.(x), [x])
end
end
@@ -104,20 +104,20 @@ using SpecialFunctions
# JuliaGPU/CUDA.jl#1085: exp uses Base.sincos performing a global CPU load
@test testf(x->exp.(x), [1e7im])
end
-
+
@testset "Real - $op" for op in (abs, abs2, exp, exp10, log, log10)
@testset "$T" for T in (Float16, Float32, Float64)
- @test testf(x->op.(x), rand(T, 1))
+ @test testf(x -> op.(x), rand(T, 1))
end
end
-
- @testset "Float16 - $op" for op in (exp,exp2,exp10,log,log2,log10)
- all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
+
+ @testset "Float16 - $op" for op in (exp, exp2, exp10, log, log2, log10)
+ all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
all_float_16 = filter(!isnan, all_float_16)
if op in (log, log2, log10)
all_float_16 = filter(>=(0), all_float_16)
end
- @test testf(x->map(op, x), all_float_16)
+ @test testf(x -> map(op, x), all_float_16)
end
@testset "fastmath" begin |
What about more native implementations?

using CUDA, LLVM, LLVM.Interop
function asmlog(x::Float16)
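# rough reading of the PTX below: convert the Float16 input to Float32, take
# lg2.approx, scale by ln(2) (0x3f317218 ≈ 0.6931472f0) to turn the base-2 log into
# a natural log, round back to Float16, then apply four input-specific fixups via
# set.eq + fma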
log_x = @asmcall("""{
.reg.b32 f, C;
.reg.b16 r,h;
mov.b16 h,\$1;
cvt.f32.f16 f,h;
lg2.approx.ftz.f32 f,f;
mov.b32 C, 0x3f317218U;
mul.f32 f,f,C;
cvt.rn.f16.f32 r,f;
.reg.b16 spc, ulp, p;
mov.b16 spc, 0X160DU;
mov.b16 ulp, 0x9C00U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 spc, 0X3BFEU;
mov.b16 ulp, 0x8010U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 spc, 0X3C0BU;
mov.b16 ulp, 0x8080U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 spc, 0X6051U;
mov.b16 ulp, 0x1C00U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 \$0,r;
}""", "=h,h", Float16, Tuple{Float16}, x)
return log_x
end
function nativelog(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath log(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x160D)), reinterpret(Float16, 0x9C00), r)
r = fma(Float16(h == reinterpret(Float16, 0x3BFE)), reinterpret(Float16, 0x8010), r)
r = fma(Float16(h == reinterpret(Float16, 0x3C0B)), reinterpret(Float16, 0x8080), r)
r = fma(Float16(h == reinterpret(Float16, 0x6051)), reinterpret(Float16, 0x1C00), r)
return r
end
function main()
CUDA.code_ptx(asmlog, Tuple{Float16})
CUDA.code_ptx(nativelog, Tuple{Float16})
return
end
function nativelog10(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath log10(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x338F)), reinterpret(Float16, 0x1000), r)
r = fma(Float16(h == reinterpret(Float16, 0x33F8)), reinterpret(Float16, 0x9000), r)
r = fma(Float16(h == reinterpret(Float16, 0x57E1)), reinterpret(Float16, 0x9800), r)
r = fma(Float16(h == reinterpret(Float16, 0x719D)), reinterpret(Float16, 0x9C00), r)
return r
end
function nativelog2(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath log2(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(r == reinterpret(Float16, 0xA2E2)), reinterpret(Float16, 0x8080), r)
r = fma(Float16(r == reinterpret(Float16, 0xBF46)), reinterpret(Float16, 0x9400), r)
return r
end
It's weird that the special cases in nativelog2 are checked against r rather than the input h, as in the other functions.

function nativeexp(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = fma(f, reinterpret(Float32, 0x3fb8aa3b), reinterpret(Float32, Base.sign_mask(Float32)))
f = @fastmath exp2(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x1F79)), reinterpret(Float16, 0x9400), r)
r = fma(Float16(h == reinterpret(Float16, 0x25CF)), reinterpret(Float16, 0x9400), r)
r = fma(Float16(h == reinterpret(Float16, 0xC13B)), reinterpret(Float16, 0x0400), r)
r = fma(Float16(h == reinterpret(Float16, 0xC1EF)), reinterpret(Float16, 0x0200), r)
return r
end
function nativeexp2(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath exp2(f)
# one ULP adjustment
f = muladd(f, reinterpret(Float32, 0x33800000), f)
r = Float16(f)
return r
end
function nativeexp10(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = fma(f, reinterpret(Float32, 0x40549A78), reinterpret(Float32, Base.sign_mask(Float32)))
f = @fastmath exp2(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x34DE)), reinterpret(Float16, 0x9800), r)
r = fma(Float16(h == reinterpret(Float16, 0x9766)), reinterpret(Float16, 0x9000), r)
r = fma(Float16(h == reinterpret(Float16, 0x9972)), reinterpret(Float16, 0x1000), r)
r = fma(Float16(h == reinterpret(Float16, 0xA5C4)), reinterpret(Float16, 0x1000), r)
r = fma(Float16(h == reinterpret(Float16, 0xBF0A)), reinterpret(Float16, 0x8100), r)
return r
end
I only did these ports by looking at the assembly, and they would still need to be tested properly. |
I wonder if some of the degenerate cases in the ASM-to-Julia ports above could be written differently (I couldn't find any existing predicates returning those bit values); maybe some floating-point wizards know (cc @oscardssmith)? |
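For what it's worth, here is a rough sketch (not part of the PR; uncorrected_log, reference_log, and find_special_cases are made-up names) of how such special-case inputs could be hunted for without reading the assembly: exhaustively compare an uncorrected Float32-domain port against a BigFloat-based reference and collect the disagreeing bit patterns, which can then be compared against the constants patched above.

# sketch: hunt for special-case inputs by comparing an uncorrected Float32-domain
# port of log against a BigFloat reference rounded directly to Float16
uncorrected_log(h::Float16) = Float16(log(Float32(h)))
reference_log(h::Float16)   = Float16(log(big(h)))

function find_special_cases(port, reference; valid = !isnan)
    mismatches = UInt16[]
    for pattern in UInt16(0):typemax(UInt16)
        h = reinterpret(Float16, pattern)
        valid(h) || continue
        port(h) === reference(h) || push!(mismatches, pattern)
    end
    return mismatches
end

# e.g. for log, restrict to the valid domain:
find_special_cases(uncorrected_log, reference_log; valid = h -> !isnan(h) && h >= 0)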
I'm in favour! My assembly skills are weak enough I didn't want to venture too far out on my own, but others should feel free to push more to this branch or point me at some references :) |
CUDA.jl Benchmarks
| Benchmark suite | Current: 9158c15 | Previous: c75b56f | Ratio |
|---|---|---|---|
| latency/precompile | 46266542969.5 ns | 46181620127.5 ns | 1.00 |
| latency/ttfp | 7033038663 ns | 7068629094 ns | 0.99 |
| latency/import | 3709401260 ns | 3724762175 ns | 1.00 |
| integration/volumerhs | 9626731.5 ns | 9624216.5 ns | 1.00 |
| integration/byval/slices=1 | 146781 ns | 146894 ns | 1.00 |
| integration/byval/slices=3 | 425052 ns | 425137.5 ns | 1.00 |
| integration/byval/reference | 144993 ns | 144952 ns | 1.00 |
| integration/byval/slices=2 | 285847 ns | 285936 ns | 1.00 |
| integration/cudadevrt | 103391 ns | 103412 ns | 1.00 |
| kernel/indexing | 14174.5 ns | 14099 ns | 1.01 |
| kernel/indexing_checked | 14867 ns | 14674 ns | 1.01 |
| kernel/occupancy | 670.2025316455696 ns | 701.1379310344828 ns | 0.96 |
| kernel/launch | 2106 ns | 2179.6666666666665 ns | 0.97 |
| kernel/rand | 14654 ns | 14749 ns | 0.99 |
| array/reverse/1d | 19381 ns | 19776 ns | 0.98 |
| array/reverse/2d | 25298 ns | 24908 ns | 1.02 |
| array/reverse/1d_inplace | 11198 ns | 10219 ns | 1.10 |
| array/reverse/2d_inplace | 13125 ns | 11910 ns | 1.10 |
| array/copy | 21390 ns | 21311 ns | 1.00 |
| array/iteration/findall/int | 159113.5 ns | 158209 ns | 1.01 |
| array/iteration/findall/bool | 139775 ns | 139123 ns | 1.00 |
| array/iteration/findfirst/int | 168927 ns | 153168 ns | 1.10 |
| array/iteration/findfirst/bool | 172718.5 ns | 154631 ns | 1.12 |
| array/iteration/scalar | 71399 ns | 71886 ns | 0.99 |
| array/iteration/logical | 215988 ns | 213254 ns | 1.01 |
| array/iteration/findmin/1d | 41936 ns | 40786 ns | 1.03 |
| array/iteration/findmin/2d | 94349 ns | 93428 ns | 1.01 |
| array/reductions/reduce/1d | 41978.5 ns | 35669 ns | 1.18 |
| array/reductions/reduce/2d | 51783 ns | 40477 ns | 1.28 |
| array/reductions/mapreduce/1d | 39167 ns | 33443 ns | 1.17 |
| array/reductions/mapreduce/2d | 52087 ns | 40694.5 ns | 1.28 |
| array/broadcast | 20737 ns | 20825 ns | 1.00 |
| array/copyto!/gpu_to_gpu | 13633 ns | 13806 ns | 0.99 |
| array/copyto!/cpu_to_gpu | 209815 ns | 208873 ns | 1.00 |
| array/copyto!/gpu_to_cpu | 244927 ns | 242948 ns | 1.01 |
| array/accumulate/1d | 108947 ns | 108924 ns | 1.00 |
| array/accumulate/2d | 79917 ns | 80034 ns | 1.00 |
| array/construct | 1293.2 ns | 1297.3 ns | 1.00 |
| array/random/randn/Float32 | 44527.5 ns | 43906.5 ns | 1.01 |
| array/random/randn!/Float32 | 26948 ns | 26669 ns | 1.01 |
| array/random/rand!/Int64 | 27243 ns | 27027 ns | 1.01 |
| array/random/rand!/Float32 | 8884.166666666668 ns | 8863 ns | 1.00 |
| array/random/rand/Int64 | 30096 ns | 30048.5 ns | 1.00 |
| array/random/rand/Float32 | 13155 ns | 13342 ns | 0.99 |
| array/permutedims/4d | 60890 ns | 60675.5 ns | 1.00 |
| array/permutedims/2d | 55611 ns | 55115.5 ns | 1.01 |
| array/permutedims/3d | 57252.5 ns | 55700 ns | 1.03 |
| array/sorting/1d | 2767601 ns | 2777689 ns | 1.00 |
| array/sorting/by | 3355313 ns | 3368739 ns | 1.00 |
| array/sorting/2d | 1082151.5 ns | 1084912 ns | 1.00 |
| cuda/synchronization/stream/auto | 1053.5 ns | 1013.5384615384615 ns | 1.04 |
| cuda/synchronization/stream/nonblocking | 6524.2 ns | 6485.2 ns | 1.01 |
| cuda/synchronization/stream/blocking | 848.8194444444445 ns | 826 ns | 1.03 |
| cuda/synchronization/context/auto | 1164.9 ns | 1197.3 ns | 0.97 |
| cuda/synchronization/context/nonblocking | 6690.4 ns | 6768.6 ns | 0.99 |
| cuda/synchronization/context/blocking | 935.1521739130435 ns | 946.4193548387096 ns | 0.99 |
This comment was automatically generated by workflow using github-action-benchmark.
test/core/device/intrinsics/math.jl
Outdated
@testset "Float16 - $op" for op in (log,exp,exp2,exp10,log2,log10) | ||
@testset "$T" for T in (Float16, ) | ||
@test testf(x->op.(x), rand(T, 1)) | ||
end | ||
end |
We could test all values here:
julia> all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
65536-element Vector{Float16}:
(there might be a better way that avoids some of the duplicated patterns, but it is only 65k in the end)
Otherwise for some of the degenerate cases we might randomly fail if we disagree with Julia.
all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
all_float_16 = filter(!isnan, all_float_16)
julia> findall(==(0), exp.(all_float_16) .== Array(exp.(CuArray(all_float_16))))
2-element Vector{Int64}:
8058
9680
julia> all_float_16[8058]
Float16(0.007298)
julia> all_float_16[9680]
Float16(0.02269)
julia> reinterpret(UInt16, all_float_16[8058])
0x1f79
julia> reinterpret(UInt16, all_float_16[9680])
0x25cf
r = fma(Float16(h == reinterpret(Float16, 0x1F79)), reinterpret(Float16, 0x9400), r)
r = fma(Float16(h == reinterpret(Float16, 0x25CF)), reinterpret(Float16, 0x9400), r)
These two cause us to disagree with Julia.
julia> findall(==(0), exp.(all_float_16) .== Array(exp.(CuArray(all_float_16))))
2-element Vector{Int64}:
8058
9680
julia> all_float_16[8058]
Float16(0.007298)
julia> all_float_16[9680]
Float16(0.02269)
julia> reinterpret(UInt16, all_float_16[8058])
0x1f79
julia> reinterpret(UInt16, all_float_16[9680])
0x25cf
julia> Float16(exp(Float32(all_float_16[8058])))
Float16(1.008)
julia> exp_cu[8058]
Float16(1.007)
julia> exp_cu[8058] - Float16(exp(Float32(all_float_16[8058])))
Float16(-0.000977)
OK I got a bit turned around here - what's the ask? Can we come up with a list of what needs to be done to get this merged? |
I think we should replace the magic constants with the expressions Valentin figured out. Beyond that, we should think about whether we want to mimic CUDA or Julia here, i.e., whether to keep the adjustments or not. And in any case, make sure there's tests covering those added snippets. |
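For concreteness, here is a sketch of what replacing the magic constants with expressions could look like; the identities below are easy to double-check in a REPL, and the exact spelling to use in the PR is up for grabs.

# candidate expressions for the reinterpret'ed constants used in the ports above
reinterpret(UInt32, Float32(log2(ℯ)))  == 0x3fb8aa3b   # nativeexp scaling factor
reinterpret(UInt32, Float32(log2(10))) == 0x40549a78   # nativeexp10 scaling factor
reinterpret(Float32, Base.sign_mask(Float32)) === -0.0f0
reinterpret(UInt32, eps(0.5f0))        == 0x33800000   # nativeexp2's one-ULP nudge

# so the exp10 scaling step could read, as in the suggested diff above:
# f = fma(f, log2(10.0f0), -0.0f0)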
f = @fastmath exp2(f)

# one ULP adjustment
f = muladd(f, reinterpret(Float32, 0x33800000), f)
Should this be an fma?
Probably, I can try to figure out what that magic number is too
I think that magic number is just a relative eps
julia> reinterpret(UInt32, eps(0.5f0))
0x33800000
Don't know why they chose 0.5f0, though.
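If that identity holds, the adjustment in nativeexp2 could be written without the raw bit pattern; just a sketch, and whether it should stay muladd or become fma is the question above.

# inside nativeexp2, assuming 0x33800000 really is eps(0.5f0) == 2f0^-24:
f = muladd(f, eps(0.5f0), f)   # relative nudge of 2^-24, roughly one ULP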
r = Float16(f)

# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x338F)), reinterpret(Float16, 0x1000), r)
We should add a comment that this is just:
r = h == reinterpret(Float16, 0x338F) ? r + reinterpret(Float16, 0x1000) : r
But FMA is the fastest way to do that xD, and it keeps your flops number up.
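A throwaway way to convince oneself of (and document) that equivalence; branchless and branchy are made-up names here, not something proposed for the PR.

spc = reinterpret(Float16, 0x338F)
ulp = reinterpret(Float16, 0x1000)
branchless(h, r) = fma(Float16(h == spc), ulp, r)   # Float16(cond) is zero or one
branchy(h, r)    = h == spc ? r + ulp : r

# spot-check over every Float16 input with a fixed r
all(branchless(h, Float16(-0.6265)) === branchy(h, Float16(-0.6265))
    for h in reinterpret.(Float16, UInt16(0):typemax(UInt16)))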
Codecov Report
All modified and coverable lines are covered by tests ✅

Additional details and impacted files

@@            Coverage Diff             @@
##           master    #2644      +/-   ##
==========================================
- Coverage   80.03%   79.91%   -0.13%
==========================================
  Files         154      154
  Lines       13685    13685
==========================================
- Hits        10953    10936      -17
- Misses       2732     2749      +17

☔ View full report in Codecov by Sentry. |
Co-authored-by: Valentin Churavy <[email protected]>
From my side, I would say let's merge this. Maybe the last decision is whether we should follow Julia or CUDA for denormal numbers.
Yeah, that's the only real thing we need to decide. I'm not sure if we've picked a side in the past, and if we've been consistent at that. I used to be of the opinion that it should be easy to port CUDA C code to Julia using CUDA.jl, but I don't think that ended up happening, and we're way more likely to see other Julia people unfamiliar with GPUs trying to run generic code using CUDA.jl. Which means it's probably a better choice to be consistent with Julia than with CUDA C. |
Since I don't have access to an A100, this is going to be a bit of "debugging via CI".