Skip to content

Commit 95894f0

Browse files
authored
mapreduce: avoid deadlock by forcing the accumulator type. (#2596)
Otherwise the accumulator may be inferred as a Union type, and we may union-split across a shfl invocation, resulting in a deadlock.
1 parent 03ebed7 commit 95894f0

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

src/mapreduce.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = ()
8888
# Reduce an array across the grid. All elements to be processed can be addressed by the
8989
# product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
9090
# singleton entries for the dimensions that should be reduced (and vice versa).
91-
function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R, As...)
91+
function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::AbstractArray{T}, As...) where T
9292
assume(length(Rother) > 0)
9393

9494
# decompose the 1D hardware indices into separate ones for reduction (across threads
@@ -112,7 +112,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R, As.
112112
neutral
113113
end
114114

115-
val = op(neutral, neutral)
115+
val::T = op(neutral, neutral)
116116

117117
# reduce serially across chunks of input vector that don't fit in a block
118118
ireduce = threadIdx_reduce + (blockIdx_reduce - 1) * blockDim_reduce

test/base/array.jl

+8
Original file line numberDiff line numberDiff line change
@@ -916,3 +916,11 @@ end
916916
@test c == a′ + b
917917
@test c === a
918918
end
919+
920+
@testset "issue 2595" begin
921+
# mixed-type reductions resulted in a deadlock because of union splitting over shfl
922+
a = CUDA.zeros(Float32, 1)
923+
b = CUDA.ones(Float64, 2)
924+
sum!(a, b)
925+
@test Array(a) == [2f0]
926+
end

0 commit comments

Comments
 (0)