Simplify using %laneid.

maleadt · maleadt · commit 9650734d7f22 · 2021-04-01T13:38:20.000+02:00
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
@@ -1,7 +1,7 @@
 # Indexing and dimensions (B.4)
 
 export
-    threadIdx, blockDim, blockIdx, gridDim,
+    threadIdx, blockDim, blockIdx, gridDim, laneid,
     warpsize
 
 @generated function _index(::Val{name}, ::Val{range}) where {name, range}
@@ -96,3 +96,11 @@ Returns the thread index within the block.
 Returns the warp size (in threads).
 """
 @inline warpsize() = Int(ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, UInt32, ()))
+
+"""
+    laneid()::UInt32
+
+Returns the thread's lane within the warp, counting from 1 (hardware `%laneid` plus one).
+"""
+@inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, UInt32, ()) + UInt32(1)
+
diff --git a/src/device/random.jl b/src/device/random.jl
@@ -100,19 +100,13 @@ that). It should always be called by at least 32 threads to ensure the random st
 initialized, even if you will be using the generator from fewer threads!
 """
 @inline Base.@propagate_inbounds function Random.seed!(rng::SharedTauswortheGenerator, seed)
-    # 0-indexed so that we can bitwise and instead of mod1
-    tid0 = threadIdx().x - 1 + (threadIdx().y - 1) * blockDim().x +
-                               (threadIdx().z - 1) * blockDim().x * blockDim().y
     state = initial_state(seed)
-    @inbounds rng.state[tid0&31 + 1] = state
+    @inbounds rng.state[laneid()] = state  # each thread fills the slot of its own lane
     return
 end
 
 @inline Base.@propagate_inbounds function initial_state(seeds)
-    # 0-indexed so that we can bitwise and instead of mod1
-    tid0 = threadIdx().x - 1 + (threadIdx().y - 1) * blockDim().x +
-                               (threadIdx().z - 1) * blockDim().x * blockDim().y
-    z = seeds[tid0&31 + 1]
+    z = seeds[laneid()]
 
     # add the block id to ensure unique values across blocks
     # XXX: is this OK? shouldn't we use a generator that allows skipping ahead?
@@ -166,28 +160,30 @@ Generate a byte of random data using the on-device Tausworthe generator.
     kernel may deadlock.
 """
 function Random.rand(rng::SharedTauswortheGenerator, ::Type{UInt32})
-    # 0-indexed so that we can bitwise and instead of mod1
-    tid0 = threadIdx().x - 1 + (threadIdx().y - 1) * blockDim().x +
-                               (threadIdx().z - 1) * blockDim().x * blockDim().y
-    i = tid0&31 + 1
-    j = tid0&3  + 1
+    @inline pow2_mod1(x, y) = (x-1)&(y-1) + 1  # mod1(x, y) via bitmask; y must be a power of 2
+
+    i = laneid()         # this thread's slot in rng.state
+    j = pow2_mod1(i, 4)  # index into the Taus* parameter tables
 
     @inbounds begin
-        # get
+        # get state (re-derived from the seed if the slot still holds zero)
         z = rng.state[i]
         if z == 0
             z = initial_state(rng.seed)
         end
 
-        sync_threads()  # XXX: this implies that rand() cannot be called from a branch
+        sync_threads()  # barrier between reading rng.state (above) and overwriting it (below)
 
-        # advance
+        # advance & update state (each thread writes only its own slot)
         S1, S2, S3, M = TausShift1()[j], TausShift2()[j], TausShift3()[j], TausOffset()[j]
         rng.state[i] = TausStep(z, S1, S2, S3, M)
 
         sync_threads()
 
-        # update
-        rng.state[tid0&31+1] ⊻ rng.state[(tid0+1)&31+1] ⊻ rng.state[(tid0+1)&31+1] ⊻ rng.state[(tid0+1)&31+1]
+        # generate: xor this slot with the next three slots, wrapping around at 32
+        rng.state[i] ⊻
+          rng.state[pow2_mod1(i+1, 32)] ⊻
+          rng.state[pow2_mod1(i+2, 32)] ⊻
+          rng.state[pow2_mod1(i+3, 32)]
     end
 end
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
@@ -14,6 +14,9 @@
     @on_device blockIdx().z
     @on_device gridDim().z
 
+    @on_device warpsize()
+    @on_device laneid()
+
     @testset "range metadata" begin
         foobar() = threadIdx().x
         ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true))