JuliaGPU · AntonOresten · Jan 2, 2026 · Jan 2, 2026
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
@@ -180,18 +180,19 @@ end
 
 ## PTX
 
-# half-precision atomics using PTX instruction
+# half-precision and bfloat16 atomics using PTX instructions
 
-for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
+for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16, :BFloat16)  # one atomic_add! method per (address space, type) pair
     if A == AS.Global
         scope = ".global"
     elseif A == AS.Shared
         scope = ".shared"
     else
         scope = ""
     end
+    name = T == :Float16 ? "f16" : "bf16"  # type suffix spliced into the PTX instruction string below
 
-    intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
+    intr = "atom$scope.add.noftz.$name \$0, [\$1], \$2;"  # operands: $0 = asm output, $1 = ptr, $2 = val
     @eval @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
         @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
 end
@@ -441,7 +442,7 @@ end
     atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val))
 
 # native atomics
-for (op,impl,typ) in [(:(+), :(atomic_add!), [:UInt32,:Int32,:UInt64,:Int64,:Float32]),
+for (op,impl,typ) in [(:(+), :(atomic_add!), [:UInt32,:Int32,:UInt64,:Int64,:Float32,:Float16,:BFloat16]),
                       (:(-), :(atomic_sub!), [:UInt32,:Int32,:UInt64,:Int64,:Float32]),
                       (:(&), :(atomic_and!), [:UInt32,:Int32,:UInt64,:Int64]),
                       (:(|), :(atomic_or!),  [:UInt32,:Int32,:UInt64,:Int64]),

diff --git a/test/core/device/intrinsics/atomics.jl b/test/core/device/intrinsics/atomics.jl
@@ -10,6 +10,7 @@ using BFloat16s: BFloat16
     types = [Int32, Int64, UInt32, UInt64, Float32]
     capability(device()) >= v"6.0" && push!(types, Float64)
     capability(device()) >= v"7.0" && push!(types, Float16)
+    capability(device()) >= v"9.0" && push!(types, BFloat16)
 
     @testset for T in types
         a = CuArray(T[0])
@@ -19,8 +20,9 @@ using BFloat16s: BFloat16
             return
         end
 
-        @cuda threads=1024 kernel(a, one(T))
-        @test Array(a)[1] == 1024
+        nthreads = T == BFloat16 ? 128 : 1024 # BFloat16 stops counting at 256 (BFloat16(256) + 1 == 256), so stay below it
+        @cuda threads=nthreads kernel(a, one(T))
+        @test Array(a)[1] == nthreads
     end
 end
 
@@ -212,6 +214,7 @@ end
 @testset "add" begin
     types = [Int32, Int64, UInt32, UInt64, Float32, Float64]
     capability(device()) >= v"7.0" && append!(types, [Int16, UInt16, Float16])
+    capability(device()) >= v"9.0" && push!(types, BFloat16)
 
     @testset for T in types
         a = CuArray([zero(T)])
@@ -222,8 +225,9 @@ end
             return
         end
 
-        @cuda threads=1024 kernel(T, a)
-        @test Array(a)[1] == 2048
+        nthreads = T == BFloat16 ? 64 : 1024 # keep the expected total (2 * nthreads) exactly representable in BFloat16
+        @cuda threads=nthreads kernel(T, a)
+        @test Array(a)[1] == 2 * nthreads
     end
 end