Wrap and test some more Float16 intrinsics #2644
base: master
Conversation
Too much to hope for that they had secretly included these in |
The |
OK! We now have some working |
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl
index 742058006..43518d0d1 100644
--- a/src/device/intrinsics/math.jl
+++ b/src/device/intrinsics/math.jl
@@ -217,7 +217,7 @@ end
@device_override function Base.exp10(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
- f = fma(f, log2(10.f0), -0.0f0)
+ f = fma(f, log2(10.0f0), -0.0f0)
f = @fastmath exp2(f)
r = Float16(f)
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index 6fa5768df..2355d0dd0 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -3,7 +3,7 @@ using SpecialFunctions
@testset "math" begin
@testset "log10" begin
for T in (Float32, Float64)
- @test testf(a->log10.(a), T[100])
+ @test testf(a -> log10.(a), T[100])
end
end
@@ -14,22 +14,22 @@ using SpecialFunctions
@test testf((x,y)->x.^y, rand(Float32, 1), -rand(range, 1))
end
end
-
+
@testset "min/max" begin
for T in (Float32, Float64)
- @test testf((x,y)->max.(x, y), rand(Float32, 1), rand(T, 1))
- @test testf((x,y)->min.(x, y), rand(Float32, 1), rand(T, 1))
+ @test testf((x, y) -> max.(x, y), rand(Float32, 1), rand(T, 1))
+ @test testf((x, y) -> min.(x, y), rand(Float32, 1), rand(T, 1))
end
end
@testset "isinf" begin
- for x in (Inf32, Inf, NaN16, NaN32, NaN)
+ for x in (Inf32, Inf, NaN16, NaN32, NaN)
@test testf(x->isinf.(x), [x])
end
end
@testset "isnan" begin
- for x in (Inf32, Inf, NaN16, NaN32, NaN)
+ for x in (Inf32, Inf, NaN16, NaN32, NaN)
@test testf(x->isnan.(x), [x])
end
end
@@ -104,20 +104,20 @@ using SpecialFunctions
# JuliaGPU/CUDA.jl#1085: exp uses Base.sincos performing a global CPU load
@test testf(x->exp.(x), [1e7im])
end
-
+
@testset "Real - $op" for op in (abs, abs2, exp, exp10, log, log10)
@testset "$T" for T in (Float16, Float32, Float64)
- @test testf(x->op.(x), rand(T, 1))
+ @test testf(x -> op.(x), rand(T, 1))
end
end
-
- @testset "Float16 - $op" for op in (exp,exp2,exp10,log,log2,log10)
- all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
+
+ @testset "Float16 - $op" for op in (exp, exp2, exp10, log, log2, log10)
+ all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
all_float_16 = filter(!isnan, all_float_16)
if op in (log, log2, log10)
all_float_16 = filter(>=(0), all_float_16)
end
- @test testf(x->map(op, x), all_float_16)
+ @test testf(x -> map(op, x), all_float_16)
end
@testset "fastmath" begin |
What about more native implementations?

using CUDA, LLVM, LLVM.Interop
function asmlog(x::Float16)
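# rough reading of the PTX below: convert the Float16 input to Float32, take
# lg2.approx, scale by ln(2) (0x3f317218 ≈ 0.6931472f0) to turn the base-2 log into
# a natural log, round back to Float16, then apply four input-specific fixups via
# set.eq + fma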
log_x = @asmcall("""{
.reg.b32 f, C;
.reg.b16 r,h;
mov.b16 h,\$1;
cvt.f32.f16 f,h;
lg2.approx.ftz.f32 f,f;
mov.b32 C, 0x3f317218U;
mul.f32 f,f,C;
cvt.rn.f16.f32 r,f;
.reg.b16 spc, ulp, p;
mov.b16 spc, 0X160DU;
mov.b16 ulp, 0x9C00U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 spc, 0X3BFEU;
mov.b16 ulp, 0x8010U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 spc, 0X3C0BU;
mov.b16 ulp, 0x8080U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 spc, 0X6051U;
mov.b16 ulp, 0x1C00U;
set.eq.f16.f16 p, h, spc;
fma.rn.f16 r,p,ulp,r;
mov.b16 \$0,r;
}""", "=h,h", Float16, Tuple{Float16}, x)
return log_x
end
function nativelog(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath log(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x160D)), reinterpret(Float16, 0x9C00), r)
r = fma(Float16(h == reinterpret(Float16, 0x3BFE)), reinterpret(Float16, 0x8010), r)
r = fma(Float16(h == reinterpret(Float16, 0x3C0B)), reinterpret(Float16, 0x8080), r)
r = fma(Float16(h == reinterpret(Float16, 0x6051)), reinterpret(Float16, 0x1C00), r)
return r
end
function main()
CUDA.code_ptx(asmlog, Tuple{Float16})
CUDA.code_ptx(nativelog, Tuple{Float16})
return
end
function nativelog10(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath log10(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x338F)), reinterpret(Float16, 0x1000), r)
r = fma(Float16(h == reinterpret(Float16, 0x33F8)), reinterpret(Float16, 0x9000), r)
r = fma(Float16(h == reinterpret(Float16, 0x57E1)), reinterpret(Float16, 0x9800), r)
r = fma(Float16(h == reinterpret(Float16, 0x719D)), reinterpret(Float16, 0x9C00), r)
return r
end
function nativelog2(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath log2(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(r == reinterpret(Float16, 0xA2E2)), reinterpret(Float16, 0x8080), r)
r = fma(Float16(r == reinterpret(Float16, 0xBF46)), reinterpret(Float16, 0x9400), r)
return r
end
It's weird that the special cases in nativelog2 are checked against r rather than the input h, as in the other functions.

function nativeexp(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = fma(f, reinterpret(Float32, 0x3fb8aa3b), reinterpret(Float32, Base.sign_mask(Float32)))
f = @fastmath exp2(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x1F79)), reinterpret(Float16, 0x9400), r)
r = fma(Float16(h == reinterpret(Float16, 0x25CF)), reinterpret(Float16, 0x9400), r)
r = fma(Float16(h == reinterpret(Float16, 0xC13B)), reinterpret(Float16, 0x0400), r)
r = fma(Float16(h == reinterpret(Float16, 0xC1EF)), reinterpret(Float16, 0x0200), r)
return r
end
function nativeexp2(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = @fastmath exp2(f)
# one ULP adjustment
f = muladd(f, reinterpret(Float32, 0x33800000), f)
r = Float16(f)
return r
end
function nativeexp10(h::Float16)
# perform computation in Float32 domain
f = Float32(h)
f = fma(f, reinterpret(Float32, 0x40549A78), reinterpret(Float32, Base.sign_mask(Float32)))
f = @fastmath exp2(f)
r = Float16(f)
# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x34DE)), reinterpret(Float16, 0x9800), r)
r = fma(Float16(h == reinterpret(Float16, 0x9766)), reinterpret(Float16, 0x9000), r)
r = fma(Float16(h == reinterpret(Float16, 0x9972)), reinterpret(Float16, 0x1000), r)
r = fma(Float16(h == reinterpret(Float16, 0xA5C4)), reinterpret(Float16, 0x1000), r)
r = fma(Float16(h == reinterpret(Float16, 0xBF0A)), reinterpret(Float16, 0x8100), r)
return r
end
I only did these ports by looking at the assembly, and they would still need to be tested properly. |
I wonder if some of the degenerate cases in the ASM-to-Julia ports above could be written differently (I couldn't find any existing predicates returning those bit values); maybe some floating-point wizards know (cc @oscardssmith)? |
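For what it's worth, here is a rough sketch (not part of the PR; uncorrected_log, reference_log, and find_special_cases are made-up names) of how such special-case inputs could be hunted for without reading the assembly: exhaustively compare an uncorrected Float32-domain port against a BigFloat-based reference and collect the disagreeing bit patterns, which can then be compared against the constants patched above.

# sketch: hunt for special-case inputs by comparing an uncorrected Float32-domain
# port of log against a BigFloat reference rounded directly to Float16
uncorrected_log(h::Float16) = Float16(log(Float32(h)))
reference_log(h::Float16)   = Float16(log(big(h)))

function find_special_cases(port, reference; valid = !isnan)
    mismatches = UInt16[]
    for pattern in UInt16(0):typemax(UInt16)
        h = reinterpret(Float16, pattern)
        valid(h) || continue
        port(h) === reference(h) || push!(mismatches, pattern)
    end
    return mismatches
end

# e.g. for log, restrict to the valid domain:
find_special_cases(uncorrected_log, reference_log; valid = h -> !isnan(h) && h >= 0)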
I'm in favour! My assembly skills are weak enough I didn't want to venture too far out on my own, but others should feel free to push more to this branch or point me at some references :) |
CUDA.jl Benchmarks
| Benchmark suite | Current: 9158c15 | Previous: c75b56f | Ratio |
|---|---|---|---|
| latency/precompile | 46266542969.5 ns | 46181620127.5 ns | 1.00 |
| latency/ttfp | 7033038663 ns | 7068629094 ns | 0.99 |
| latency/import | 3709401260 ns | 3724762175 ns | 1.00 |
| integration/volumerhs | 9626731.5 ns | 9624216.5 ns | 1.00 |
| integration/byval/slices=1 | 146781 ns | 146894 ns | 1.00 |
| integration/byval/slices=3 | 425052 ns | 425137.5 ns | 1.00 |
| integration/byval/reference | 144993 ns | 144952 ns | 1.00 |
| integration/byval/slices=2 | 285847 ns | 285936 ns | 1.00 |
| integration/cudadevrt | 103391 ns | 103412 ns | 1.00 |
| kernel/indexing | 14174.5 ns | 14099 ns | 1.01 |
| kernel/indexing_checked | 14867 ns | 14674 ns | 1.01 |
| kernel/occupancy | 670.2025316455696 ns | 701.1379310344828 ns | 0.96 |
| kernel/launch | 2106 ns | 2179.6666666666665 ns | 0.97 |
| kernel/rand | 14654 ns | 14749 ns | 0.99 |
| array/reverse/1d | 19381 ns | 19776 ns | 0.98 |
| array/reverse/2d | 25298 ns | 24908 ns | 1.02 |
| array/reverse/1d_inplace | 11198 ns | 10219 ns | 1.10 |
| array/reverse/2d_inplace | 13125 ns | 11910 ns | 1.10 |
| array/copy | 21390 ns | 21311 ns | 1.00 |
| array/iteration/findall/int | 159113.5 ns | 158209 ns | 1.01 |
| array/iteration/findall/bool | 139775 ns | 139123 ns | 1.00 |
| array/iteration/findfirst/int | 168927 ns | 153168 ns | 1.10 |
| array/iteration/findfirst/bool | 172718.5 ns | 154631 ns | 1.12 |
| array/iteration/scalar | 71399 ns | 71886 ns | 0.99 |
| array/iteration/logical | 215988 ns | 213254 ns | 1.01 |
| array/iteration/findmin/1d | 41936 ns | 40786 ns | 1.03 |
| array/iteration/findmin/2d | 94349 ns | 93428 ns | 1.01 |
| array/reductions/reduce/1d | 41978.5 ns | 35669 ns | 1.18 |
| array/reductions/reduce/2d | 51783 ns | 40477 ns | 1.28 |
| array/reductions/mapreduce/1d | 39167 ns | 33443 ns | 1.17 |
| array/reductions/mapreduce/2d | 52087 ns | 40694.5 ns | 1.28 |
| array/broadcast | 20737 ns | 20825 ns | 1.00 |
| array/copyto!/gpu_to_gpu | 13633 ns | 13806 ns | 0.99 |
| array/copyto!/cpu_to_gpu | 209815 ns | 208873 ns | 1.00 |
| array/copyto!/gpu_to_cpu | 244927 ns | 242948 ns | 1.01 |
| array/accumulate/1d | 108947 ns | 108924 ns | 1.00 |
| array/accumulate/2d | 79917 ns | 80034 ns | 1.00 |
| array/construct | 1293.2 ns | 1297.3 ns | 1.00 |
| array/random/randn/Float32 | 44527.5 ns | 43906.5 ns | 1.01 |
| array/random/randn!/Float32 | 26948 ns | 26669 ns | 1.01 |
| array/random/rand!/Int64 | 27243 ns | 27027 ns | 1.01 |
| array/random/rand!/Float32 | 8884.166666666668 ns | 8863 ns | 1.00 |
| array/random/rand/Int64 | 30096 ns | 30048.5 ns | 1.00 |
| array/random/rand/Float32 | 13155 ns | 13342 ns | 0.99 |
| array/permutedims/4d | 60890 ns | 60675.5 ns | 1.00 |
| array/permutedims/2d | 55611 ns | 55115.5 ns | 1.01 |
| array/permutedims/3d | 57252.5 ns | 55700 ns | 1.03 |
| array/sorting/1d | 2767601 ns | 2777689 ns | 1.00 |
| array/sorting/by | 3355313 ns | 3368739 ns | 1.00 |
| array/sorting/2d | 1082151.5 ns | 1084912 ns | 1.00 |
| cuda/synchronization/stream/auto | 1053.5 ns | 1013.5384615384615 ns | 1.04 |
| cuda/synchronization/stream/nonblocking | 6524.2 ns | 6485.2 ns | 1.01 |
| cuda/synchronization/stream/blocking | 848.8194444444445 ns | 826 ns | 1.03 |
| cuda/synchronization/context/auto | 1164.9 ns | 1197.3 ns | 0.97 |
| cuda/synchronization/context/nonblocking | 6690.4 ns | 6768.6 ns | 0.99 |
| cuda/synchronization/context/blocking | 935.1521739130435 ns | 946.4193548387096 ns | 0.99 |
This comment was automatically generated by workflow using github-action-benchmark.
test/core/device/intrinsics/math.jl
Outdated
@testset "Float16 - $op" for op in (log,exp,exp2,exp10,log2,log10) | ||
@testset "$T" for T in (Float16, ) | ||
@test testf(x->op.(x), rand(T, 1)) | ||
end | ||
end |
We could test all values here:
julia> all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
65536-element Vector{Float16}:
(there might be a better way that avoids some of the duplicated patterns, but it is only 65k in the end)
Otherwise for some of the degenerate cases we might randomly fail if we disagree with Julia.
all_float_16 = collect(reinterpret(Float16, pattern) for pattern in UInt16(0):UInt16(1):typemax(UInt16))
all_float_16 = filter(!isnan, all_float_16)
julia> findall(==(0), exp.(all_float_16) .== Array(exp.(CuArray(all_float_16))))
2-element Vector{Int64}:
8058
9680
julia> all_float_16[8058]
Float16(0.007298)
julia> all_float_16[9680]
Float16(0.02269)
julia> reinterpret(UInt16, all_float_16[8058])
0x1f79
julia> reinterpret(UInt16, all_float_16[9680])
0x25cf
r = fma(Float16(h == reinterpret(Float16, 0x1F79)), reinterpret(Float16, 0x9400), r)
r = fma(Float16(h == reinterpret(Float16, 0x25CF)), reinterpret(Float16, 0x9400), r)
These two cause us to disagree with Julia.
julia> findall(==(0), exp.(all_float_16) .== Array(exp.(CuArray(all_float_16))))
2-element Vector{Int64}:
8058
9680
julia> all_float_16[8058]
Float16(0.007298)
julia> all_float_16[9680]
Float16(0.02269)
julia> reinterpret(UInt16, all_float_16[8058])
0x1f79
julia> reinterpret(UInt16, all_float_16[9680])
0x25cf
julia> Float16(exp(Float32(all_float_16[8058])))
Float16(1.008)
julia> exp_cu[8058]
Float16(1.007)
julia> exp_cu[8058] - Float16(exp(Float32(all_float_16[8058])))
Float16(-0.000977)
OK I got a bit turned around here - what's the ask? Can we come up with a list of what needs to be done to get this merged? |
I think we should replace the magic constants with the expressions Valentin figured out. Beyond that, we should think about whether we want to mimic CUDA or Julia here, i.e., whether to keep the adjustments or not. And in any case, make sure there's tests covering those added snippets. |
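For concreteness, here is a sketch of what replacing the magic constants with expressions could look like; the identities below are easy to double-check in a REPL, and the exact spelling to use in the PR is up for grabs.

# candidate expressions for the reinterpret'ed constants used in the ports above
reinterpret(UInt32, Float32(log2(ℯ)))  == 0x3fb8aa3b   # nativeexp scaling factor
reinterpret(UInt32, Float32(log2(10))) == 0x40549a78   # nativeexp10 scaling factor
reinterpret(Float32, Base.sign_mask(Float32)) === -0.0f0
reinterpret(UInt32, eps(0.5f0))        == 0x33800000   # nativeexp2's one-ULP nudge

# so the exp10 scaling step could read, as in the suggested diff above:
# f = fma(f, log2(10.0f0), -0.0f0)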
f = @fastmath exp2(f)

# one ULP adjustment
f = muladd(f, reinterpret(Float32, 0x33800000), f)
Should this be an fma?
Probably, I can try to figure out what that magic number is too
I think that magic number is just a relative eps
julia> reinterpret(UInt32, eps(0.5f0))
0x33800000
Don't know why they chose 0.5f0, though.
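If that identity holds, the adjustment in nativeexp2 could be written without the raw bit pattern; just a sketch, and whether it should stay muladd or become fma is the question above.

# inside nativeexp2, assuming 0x33800000 really is eps(0.5f0) == 2f0^-24:
f = muladd(f, eps(0.5f0), f)   # relative nudge of 2^-24, roughly one ULP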
r = Float16(f)

# handle degenerate cases
r = fma(Float16(h == reinterpret(Float16, 0x338F)), reinterpret(Float16, 0x1000), r)
We should add a comment that this is just:
r = h == reinterpret(Float16, 0x338F) ? r + reinterpret(Float16, 0x1000) : r
But FMA is the fastest way to do that xD, and it keeps your flops number up.
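A throwaway way to convince oneself of (and document) that equivalence; branchless and branchy are made-up names here, not something proposed for the PR.

spc = reinterpret(Float16, 0x338F)
ulp = reinterpret(Float16, 0x1000)
branchless(h, r) = fma(Float16(h == spc), ulp, r)   # Float16(cond) is zero or one
branchy(h, r)    = h == spc ? r + ulp : r

# spot-check over every Float16 input with a fixed r
all(branchless(h, Float16(-0.6265)) === branchy(h, Float16(-0.6265))
    for h in reinterpret.(Float16, UInt16(0):typemax(UInt16)))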
Codecov Report
All modified and coverable lines are covered by tests ✅

Additional details and impacted files

@@            Coverage Diff             @@
##           master    #2644      +/-   ##
==========================================
- Coverage   80.03%   79.91%   -0.13%
==========================================
  Files         154      154
  Lines       13685    13685
==========================================
- Hits        10953    10936      -17
- Misses       2732     2749      +17

☔ View full report in Codecov by Sentry. |
Co-authored-by: Valentin Churavy <[email protected]>
From my side, I would say let's merge this. Maybe the last decision is whether we should follow Julia or CUDA for denormal numbers.
Yeah, that's the only real thing we need to decide. I'm not sure if we've picked a side in the past, and if we've been consistent at that. I used to be of the opinion that it should be easy to port CUDA C code to Julia using CUDA.jl, but I don't think that ended up happening, and we're way more likely to see other Julia people unfamiliar with GPUs trying to run generic code using CUDA.jl. Which means it's probably a better choice to be consistent with Julia than with CUDA C. |
Since I don't have access to an A100, this is going to be a bit of "debugging via CI".