From 5ea272f334c97b21694c9930bbbf78993b26f379 Mon Sep 17 00:00:00 2001 From: Charles Kawczynski Date: Fri, 4 Oct 2024 10:23:28 -0400 Subject: [PATCH] Use verbose names for device types --- ext/MultiBroadcastFusionCUDAExt.jl | 4 ++-- src/execution/fused_kernels.jl | 10 +++++----- test/execution/bm_fused_reads_vs_hard_coded.jl | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ext/MultiBroadcastFusionCUDAExt.jl b/ext/MultiBroadcastFusionCUDAExt.jl index 3c6af49..61e3895 100644 --- a/ext/MultiBroadcastFusionCUDAExt.jl +++ b/ext/MultiBroadcastFusionCUDAExt.jl @@ -4,9 +4,9 @@ import CUDA, Adapt import MultiBroadcastFusion as MBF import MultiBroadcastFusion: fused_copyto! -MBF.device(x::CUDA.CuArray) = MBF.GPU() +MBF.device(x::CUDA.CuArray) = MBF.MBF_CUDA() -function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.GPU) +function fused_copyto!(fmb::MBF.FusedMultiBroadcast, ::MBF.MBF_CUDA) (; pairs) = fmb dest = first(pairs).first nitems = length(parent(dest)) diff --git a/src/execution/fused_kernels.jl b/src/execution/fused_kernels.jl index f3bd891..c0d2fcc 100644 --- a/src/execution/fused_kernels.jl +++ b/src/execution/fused_kernels.jl @@ -2,9 +2,9 @@ @make_fused fused_direct FusedMultiBroadcast fused_direct @make_fused fused_assemble FusedMultiBroadcast fused_assemble -struct CPU end -struct GPU end -device(x::AbstractArray) = CPU() +struct MBF_CPU end +struct MBF_CUDA end +device(x::AbstractArray) = MBF_CPU() function Base.copyto!(fmb::FusedMultiBroadcast) pairs = fmb.pairs # (Pair(dest1, bc1),Pair(dest2, bc2),...) @@ -26,7 +26,7 @@ Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, i...) = @inline rcopyto_at!(pairs::Tuple{}, i...) = nothing # This is better than the baseline. -function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU) +function fused_copyto!(fmb::FusedMultiBroadcast, ::MBF_CPU) (; pairs) = fmb destinations = map(x -> x.first, pairs) ei = if eltype(destinations) <: Vector @@ -44,7 +44,7 @@ end # This should, in theory be better, but it seems like inlining is # failing somewhere. -# function fused_copyto!(fmb::FusedMultiBroadcast, ::CPU) +# function fused_copyto!(fmb::FusedMultiBroadcast, ::MBF_CPU) # (; pairs) = fmb # destinations = map(x -> x.first, pairs) # ei = if eltype(destinations) <: Vector diff --git a/test/execution/bm_fused_reads_vs_hard_coded.jl b/test/execution/bm_fused_reads_vs_hard_coded.jl index 4141567..18695e3 100644 --- a/test/execution/bm_fused_reads_vs_hard_coded.jl +++ b/test/execution/bm_fused_reads_vs_hard_coded.jl @@ -10,7 +10,7 @@ import MultiBroadcastFusion as MBF # =========================================== hard-coded implementations perf_kernel_hard_coded!(X, Y) = perf_kernel_hard_coded!(X, Y, MBF.device(X.x1)) -function perf_kernel_hard_coded!(X, Y, ::MBF.CPU) +function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CPU) (; x1, x2, x3, x4) = X (; y1, y2, y3, y4) = Y @inbounds for i in eachindex(x1) @@ -24,7 +24,7 @@ end @static get(ENV, "USE_CUDA", nothing) == "true" && using CUDA use_cuda = @isdefined(CUDA) && CUDA.has_cuda() # will be true if you first run `using CUDA` @static if use_cuda - function perf_kernel_hard_coded!(X, Y, ::MBF.GPU) + function perf_kernel_hard_coded!(X, Y, ::MBF.MBF_CUDA) x1 = X.x1 nitems = length(parent(x1)) max_threads = 256 # can be higher if conditions permit