Skip to content

Commit 1e6ec76

Browse files
committed
Workaround #28126, support SIMDing broadcast in more cases
This is an ugly performance hack around issue #28126 in some limited (but common) cases. The problem in short: when given many arrays of the same size, LLVM has difficulty hoisting the decision of whether a given dimension should be "extruded" out of the loop. This extra indirection in the index computation seems to foil the array bounds aliasing checks, which stymies SIMDification. The solution: check to see if _Julia_ can statically decide whether or not to extrude any dimensions in a given broadcast expression -- and if so, use a special array wrapper that flags that none of the dimensions in that array need to be extruded out in order to perform the broadcast.
1 parent 9562bdf commit 1e6ec76

File tree

1 file changed

+42
-9
lines changed

1 file changed

+42
-9
lines changed

base/broadcast.jl

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -864,13 +864,14 @@ broadcast_unalias(::Nothing, src) = src
864864

865865
# Preprocessing a `Broadcasted` does two things:
866866
# * unaliases any arguments from `dest`
867-
# * "extrudes" the arguments where it is advantageous to pre-compute the broadcasted indices
868-
@inline preprocess(dest, bc::Broadcasted{Style}) where {Style} = Broadcasted{Style}(bc.f, preprocess_args(dest, bc.args), bc.axes)
869-
preprocess(dest, x) = extrude(broadcast_unalias(dest, x))
867+
# * calls `f` on the arguments (typically `extrude`, which pre-computes the broadcasted indices where advantageous)
868+
# Default entry point: preprocess `bc` against `dest`, using `extrude` as the
# transform applied to each (unaliased) argument.
@inline function preprocess(dest, bc)
    return preprocess(extrude, dest, bc)
end
869+
# Rebuild a `Broadcasted` of the same style whose arguments have each been
# preprocessed with `f`; the broadcast function and axes are carried over as-is.
@inline function preprocess(f, dest, bc::Broadcasted{Style}) where {Style}
    args′ = preprocess_args(f, dest, bc.args)
    return Broadcasted{Style}(bc.f, args′, bc.axes)
end
870+
# Leaf case (anything that is not a `Broadcasted`): unalias `x` from `dest`,
# then pass the result through `f`.
function preprocess(f, dest, x)
    unaliased = broadcast_unalias(dest, x)
    return f(unaliased)
end
870871

871-
@inline preprocess_args(dest, args::Tuple) = (preprocess(dest, args[1]), preprocess_args(dest, tail(args))...)
872-
preprocess_args(dest, args::Tuple{Any}) = (preprocess(dest, args[1]),)
873-
preprocess_args(dest, args::Tuple{}) = ()
872+
# Recursive case: preprocess the first argument, then the remaining ones, and
# splice the results back into a single tuple.
@inline function preprocess_args(f, dest, args::Tuple)
    head = preprocess(f, dest, args[1])
    rest = preprocess_args(f, dest, tail(args))
    return (head, rest...)
end
873+
# Single-argument case: returns a 1-tuple holding the preprocessed argument.
function preprocess_args(f, dest, args::Tuple{Any})
    only_arg = preprocess(f, dest, args[1])
    return (only_arg,)
end
874+
# Base case of the recursion: no arguments left, so there is nothing to preprocess.
function preprocess_args(f, dest, args::Tuple{})
    return ()
end
874875

875876
# Specialize this method if all you want to do is specialize on typeof(dest)
876877
@inline function copyto!(dest::AbstractArray, bc::Broadcasted{Nothing})
@@ -882,13 +883,45 @@ preprocess_args(dest, args::Tuple{}) = ()
882883
return copyto!(dest, A)
883884
end
884885
end
885-
bc′ = preprocess(dest, bc)
886-
@simd for I in eachindex(bc′)
887-
@inbounds dest[I] = bc′[I]
886+
# Ugly performance hack around issue #28126: determine if all arguments to the
887+
# broadcast are sized such that the broadcasting core can statically determine
888+
# whether a given dimension is "extruded" or not. If so, we don't need to check
889+
# any array sizes within the inner loop. Ideally this really should be something
890+
# that Julia and/or LLVM could figure out and eliminate... and indeed they can
891+
# for limited numbers of arguments.
892+
if _is_static_broadcast_28126(dest, bc)
893+
# Static path: wrap array arguments with `_nonextrude_28126` instead of `extrude`.
# (The helper is defined with a leading underscore; the un-prefixed name is undefined.)
bcs′ = preprocess(_nonextrude_28126, dest, bc)
894+
@simd for I in eachindex(bcs′)
895+
@inbounds dest[I] = bcs′[I]
896+
end
897+
else
898+
bc′ = preprocess(extrude, dest, bc)
899+
@simd for I in eachindex(bc′)
900+
@inbounds dest[I] = bc′[I]
901+
end
888902
end
889903
return dest
890904
end
891905

906+
# A nested `Broadcasted` is static exactly when all of its arguments are.
@inline function _is_static_broadcast_28126(dest, bc::Broadcasted{Style}) where {Style}
    return _is_static_broadcast_28126_args(dest, bc.args)
end
907+
# Conservative fallback: an argument of unknown kind is never treated as static.
function _is_static_broadcast_28126(dest, x)
    return false
end
908+
# `Ref`s, tuples, types, numbers and zero-dimensional arrays are always
# reported as static, whatever `dest` is.
function _is_static_broadcast_28126(dest, x::Union{Ref, Tuple, Type, Number, AbstractArray{<:Any,0}})
    return true
end
909+
# Zero-dimensional array against an array destination: always reported as static.
function _is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,0})
    return true
end
910+
# A vector argument is only static when `dest` has exactly the same axes, i.e.
# `dest` is itself one-dimensional with a matching first axis. Comparing only
# the first axis would also accept a vector broadcast into a higher-dimensional
# `dest`; the non-extruding wrapper indexes the argument directly with `dest`'s
# index and does not drop trailing dimensions, so that case must take the
# regular `extrude` path instead.
function _is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray{<:Any,1})
    return axes(dest) == axes(x)
end
911+
# General array case: static only when `x` has exactly the same axes as `dest`.
# Note from the original author: this could be refined for arguments that are
# merely missing some of `dest`'s dimensions.
function _is_static_broadcast_28126(dest::AbstractArray, x::AbstractArray)
    return axes(dest) == axes(x)
end
912+
913+
# Recursive case: every argument must be static; stop at the first one that is not.
@inline function _is_static_broadcast_28126_args(dest, args::Tuple)
    _is_static_broadcast_28126(dest, args[1]) || return false
    return _is_static_broadcast_28126_args(dest, tail(args))
end
914+
# Single-argument case: the answer is whatever the lone argument reports.
function _is_static_broadcast_28126_args(dest, args::Tuple{Any})
    return _is_static_broadcast_28126(dest, args[1])
end
915+
# Base case of the recursion: an empty argument list is trivially static.
function _is_static_broadcast_28126_args(dest, args::Tuple{})
    return true
end
916+
917+
# Wrapper used by the #28126 workaround: holds a broadcast argument `x` that is
# indexed directly with the broadcast index (see the `_broadcast_getindex`
# method for this type), i.e. without any per-index extrusion decision.
struct _NonExtruded28126{T}
918+
x::T
919+
end
920+
# The wrapper is transparent with respect to axes: report those of the wrapped value.
@inline function axes(b::_NonExtruded28126)
    return axes(b.x)
end
921+
# Index the wrapped value directly with the broadcast index `i`; no dimensions
# are dropped or remapped here.
Base.@propagate_inbounds function _broadcast_getindex(b::_NonExtruded28126, i)
    return b.x[i]
end
922+
# Wrap an array so that the broadcast kernel indexes it directly.
_nonextrude_28126(x::AbstractArray) = _NonExtruded28126(x)
# Zero-dimensional arrays are accepted by the static check for any `dest`, but
# they cannot be indexed directly with `dest`'s index (that would be out of
# bounds for every index but the first). Keep them on the regular `extrude` path.
_nonextrude_28126(x::AbstractArray{<:Any,0}) = extrude(x)
923+
# Anything that is not an array is passed through untouched.
function _nonextrude_28126(x)
    return x
end
924+
892925
# Performance optimization: for BitArray outputs, we cache the result
893926
# in a "small" Vector{Bool}, and then copy in chunks into the output
894927
@inline function copyto!(dest::BitArray, bc::Broadcasted{Nothing})

0 commit comments

Comments
 (0)