Skip to content

Commit 9650734

Browse files
committed
Simplify using %laneid.
1 parent a96f52f commit 9650734

File tree

3 files changed

+26
-19
lines changed

3 files changed

+26
-19
lines changed

src/device/intrinsics/indexing.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Indexing and dimensions (B.4)
22

33
export
4-
threadIdx, blockDim, blockIdx, gridDim,
4+
threadIdx, blockDim, blockIdx, gridDim, laneid,
55
warpsize
66

77
@generated function _index(::Val{name}, ::Val{range}) where {name, range}
@@ -96,3 +96,11 @@ Returns the thread index within the block.
9696
Returns the warp size (in threads).
9797
"""
9898
@inline function warpsize()
    # read the PTX special register through the NVVM intrinsic, then widen to Int
    raw = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, UInt32, ())
    return Int(raw)
end
99+
100+
"""
101+
laneid()::Int
102+
103+
Returns the 1-based index of the thread's lane within the warp.
104+
"""
105+
@inline function laneid()
    # the NVVM intrinsic yields the raw %laneid value; adding one makes it usable as a
    # 1-based Julia index
    raw = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, UInt32, ())
    return Int(raw) + UInt32(1)
end
106+

src/device/random.jl

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -100,19 +100,13 @@ that). It should always be called by at least 32 threads to ensure the random st
100100
initialized, even if you will be using the generator from fewer threads!
101101
"""
102102
@inline Base.@propagate_inbounds function Random.seed!(rng::SharedTauswortheGenerator, seed)
103-
# 0-indexed so that we can bitwise and instead of mod1
104-
tid0 = threadIdx().x - 1 + (threadIdx().y - 1) * blockDim().x +
105-
(threadIdx().z - 1) * blockDim().x * blockDim().y
106103
state = initial_state(seed)
107-
@inbounds rng.state[tid0&31 + 1] = state
104+
@inbounds rng.state[laneid()] = state
108105
return
109106
end
110107

111108
@inline Base.@propagate_inbounds function initial_state(seeds)
112-
# 0-indexed so that we can bitwise and instead of mod1
113-
tid0 = threadIdx().x - 1 + (threadIdx().y - 1) * blockDim().x +
114-
(threadIdx().z - 1) * blockDim().x * blockDim().y
115-
z = seeds[tid0&31 + 1]
109+
z = seeds[laneid()]
116110

117111
# add the block id to ensure unique values across blocks
118112
# XXX: is this OK? shouldn't we use a generator that allows skipping ahead?
@@ -166,28 +160,30 @@ Generate a byte of random data using the on-device Tausworthe generator.
166160
kernel may deadlock.
167161
"""
168162
function Random.rand(rng::SharedTauswortheGenerator, ::Type{UInt32})
169-
# 0-indexed so that we can bitwise and instead of mod1
170-
tid0 = threadIdx().x - 1 + (threadIdx().y - 1) * blockDim().x +
171-
(threadIdx().z - 1) * blockDim().x * blockDim().y
172-
i = tid0&31 + 1
173-
j = tid0&3 + 1
163+
# 1-based modulus via bit masking: for a power-of-two `y` this wraps `x` into 1..y,
# matching `mod1(x, y)` without a division
@inline function pow2_mod1(x, y)
    mask = y - 1
    return ((x - 1) & mask) + 1
end
164+
165+
i = laneid()
166+
j = pow2_mod1(i, 4)
174167

175168
@inbounds begin
176-
# get
169+
# get state
177170
z = rng.state[i]
178171
if z == 0
179172
z = initial_state(rng.seed)
180173
end
181174

182-
sync_threads() # XXX: this implies that rand() cannot be called from a branch
175+
sync_threads()
183176

184-
# advance
177+
# advance & update state
185178
S1, S2, S3, M = TausShift1()[j], TausShift2()[j], TausShift3()[j], TausOffset()[j]
186179
rng.state[i] = TausStep(z, S1, S2, S3, M)
187180

188181
sync_threads()
189182

190-
# update
191-
rng.state[tid0&31+1] ⊻ rng.state[(tid0+1)&31+1] ⊻ rng.state[(tid0+1)&31+1] ⊻ rng.state[(tid0+1)&31+1]
183+
# generate
184+
rng.state[i] ⊻
185+
rng.state[pow2_mod1(i+1, 32)] ⊻
186+
rng.state[pow2_mod1(i+2, 32)] ⊻
187+
rng.state[pow2_mod1(i+3, 32)]
192188
end
193189
end

test/device/intrinsics.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
@on_device blockIdx().z
1515
@on_device gridDim().z
1616

17+
@on_device warpsize()
18+
@on_device laneid()
19+
1720
@testset "range metadata" begin
1821
foobar() = threadIdx().x
1922
ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true))

0 commit comments

Comments
 (0)