diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index b569ca24e7..d1dac97a2c 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -306,6 +306,7 @@ The following keyword arguments are supported:
   supported on LLVM 4.0+)
 - `name`: override the name that the kernel will have in the generated code
 - `always_inline`: inline all function calls in the kernel
+- `fastmath`: use less precise square roots and flush denormals
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index e948972407..d4b044cdff 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -157,6 +157,29 @@ end
     @test !occursin(".local", asm)
 end
 
+@testset "fastmath" begin
+    function sqrt_kernel(x)
+        i = threadIdx().x
+        @inbounds x[i] = sqrt(x[i])
+        return
+    end
+
+    function div_kernel(x)
+        i = threadIdx().x
+        @fastmath @inbounds x[i] = 1 / x[i]
+        return
+    end
+
+    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
+    @test occursin("sqrt.r", asm)
+
+    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+    @test occursin("sqrt.approx.ftz", asm)
+
+    asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+    @test occursin("div.approx.ftz", asm)
+end
+
 end
 
 ############################################################################################