diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index b569ca24e7..d1dac97a2c 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -306,6 +306,7 @@ The following keyword arguments are supported:
   supported on LLVM 4.0+)
 - `name`: override the name that the kernel will have in the generated code
 - `always_inline`: inline all function calls in the kernel
+- `fastmath`: use less precise square roots and flush denormals
 
 The output of this function is automatically cached, i.e. you can simply call `cufunction`
 in a hot path without degrading performance. New code will be generated automatically, when
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index e948972407..d4b044cdff 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -157,6 +157,29 @@ end
     @test !occursin(".local", asm)
 end
 
+@testset "fastmath" begin
+    function sqrt_kernel(x)
+        i = threadIdx().x
+        @inbounds x[i] = sqrt(x[i])
+        return
+    end
+
+    function div_kernel(x)
+        i = threadIdx().x
+        @fastmath @inbounds x[i] = 1 / x[i]
+        return
+    end
+
+    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
+    @test occursin("sqrt.r", asm)
+
+    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+    @test occursin("sqrt.approx.ftz", asm)
+
+    asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
+    @test occursin("div.approx.ftz", asm)
+end
+
 end
 
 ############################################################################################