From 03b7eec8b1eca101348e2cbd25eb3315a79a2a6b Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 15:15:08 -0500
Subject: [PATCH 01/24] Optimize run_spin_precession for CPU

---
 KomaMRICore/ext/KomaAMDGPUExt.jl              |  2 +
 KomaMRICore/ext/KomaCUDAExt.jl                |  2 +
 KomaMRICore/ext/KomaMetalExt.jl               | 10 +-
 KomaMRICore/ext/KomaoneAPIExt.jl              |  9 +-
 .../Bloch/BlochDictSimulationMethod.jl        | 94 ++++++++++++++-----
 .../simulation/Bloch/BlochSimulationMethod.jl | 87 +++++++++++++----
 .../src/simulation/Bloch/KernelFunctions.jl   | 50 ----------
 KomaMRICore/src/simulation/SimulatorCore.jl   | 10 +-
 8 files changed, 157 insertions(+), 107 deletions(-)
 delete mode 100644 KomaMRICore/src/simulation/Bloch/KernelFunctions.jl

diff --git a/KomaMRICore/ext/KomaAMDGPUExt.jl b/KomaMRICore/ext/KomaAMDGPUExt.jl
index d0b5f1350..5c14edb04 100644
--- a/KomaMRICore/ext/KomaAMDGPUExt.jl
+++ b/KomaMRICore/ext/KomaAMDGPUExt.jl
@@ -1,6 +1,7 @@
 module KomaAMDGPUExt
 
 using AMDGPU
+using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -20,6 +21,7 @@ end
 
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], ROCBackend())
+    @suppress AMDGPU.allowscalar(true)
 end
 
 end
\ No newline at end of file
diff --git a/KomaMRICore/ext/KomaCUDAExt.jl b/KomaMRICore/ext/KomaCUDAExt.jl
index cc97a0a96..0177dc1b8 100644
--- a/KomaMRICore/ext/KomaCUDAExt.jl
+++ b/KomaMRICore/ext/KomaCUDAExt.jl
@@ -1,6 +1,7 @@
 module KomaCUDAExt
 
 using CUDA
+using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -19,6 +20,7 @@ end
 
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], CUDABackend())
+    @suppress CUDA.allowscalar(true)
 end
 
 end
\ No newline at end of file
diff --git a/KomaMRICore/ext/KomaMetalExt.jl b/KomaMRICore/ext/KomaMetalExt.jl
index d50af7901..31172d0ec 100644
--- a/KomaMRICore/ext/KomaMetalExt.jl
+++ b/KomaMRICore/ext/KomaMetalExt.jl
@@ -3,6 +3,7 @@
 module KomaMetalExt
 
 using Metal
+using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -16,16 +17,9 @@ function KomaMRICore._print_devices(::MetalBackend)
     @info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
 end
 
-#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
-#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
-#can be removed
-Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
-Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
-Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))
-
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
-    @warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
+    @suppress Metal.allowscalar(true)
 end
 
 end
diff --git a/KomaMRICore/ext/KomaoneAPIExt.jl b/KomaMRICore/ext/KomaoneAPIExt.jl
index 0bec755fc..46dfcfeb3 100644
--- a/KomaMRICore/ext/KomaoneAPIExt.jl
+++ b/KomaMRICore/ext/KomaoneAPIExt.jl
@@ -1,6 +1,7 @@
 module KomaoneAPIExt
 
 using oneAPI
+using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -17,15 +18,9 @@ function KomaMRICore._print_devices(::oneAPIBackend)
     @info "$(length(oneAPI.devices())) oneAPI capable device(s)." devices...
 end
 
-#Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
-#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
-Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
-Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
-Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))
-
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
-    @warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"
+    @suppress oneAPI.allowscalar(true)
 end
 
 end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
index 3cf7abb6c..1dfe1c108 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
@@ -14,6 +14,30 @@ function sim_output_dim(
     return (sum(seq.ADC.N), length(obj), out_state_dim)
 end
 
+"""Preallocated arrays for use in run_spin_precession."""
+struct BlochDictPrealloc{T} <: PreallocResult{T}
+    Bz_1::AbstractVector{T}
+    Bz_2::AbstractVector{T}
+    ϕ::AbstractVector{T}
+end
+
+Base.view(p::BlochDictPrealloc, i::UnitRange) = begin
+    @views BlochDictPrealloc(
+        p.Bz_1[i],
+        p.Bz_2[i],
+        p.ϕ[i]
+    )
+end
+
+"""BlochDict preallocation function. Returns arrays for use in run_spin_precession."""
+function prealloc(sim_method::BlochDict, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
+    BlochDictPrealloc(
+        similar(obj.x),
+        similar(obj.x),
+        similar(obj.x),
+    )
+end
+
 """
     run_spin_precession(obj, seq, Xt, sig)
 
@@ -35,33 +59,59 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::BlochDict,
-    backend::KA.Backend
+    backend::KA.Backend,
+    prealloc::BlochDictPrealloc
 ) where {T<:Real}
     #Simulation
     #Motion
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-    #Effective field
-    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
-    #Rotation
-    if is_ADC_on(seq)
-        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
-    else
-        ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
+    
+    #Initialize arrays
+    Bz_1 = prealloc.Bz_1
+    Bz_2 = prealloc.Bz_2
+    ϕ = prealloc.ϕ
+    fill!(ϕ, zero(T))
+    Bz_1 .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
+
+    # Fill sig[1] if needed
+    ADC_idx = 1
+    if (seq.ADC[1])
+        sig[1,:,1] .= M.xy
+        if sim_method.save_Mz
+            sig[1,:,2] .= M.z
+        end
+        ADC_idx += 1
     end
-    #Mxy precession and relaxation, and Mz relaxation
-    tp = cumsum(seq.Δt) # t' = t - t0
-    dur = sum(seq.Δt)   # Total length, used for signal relaxation
-    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
-    M.xy .= Mxy[:, end]
-    #Acquired signal
-    sig[:, :, 1] .= transpose(Mxy[:, findall(seq.ADC)])
-
-    if sim_method.save_Mz
-        Mz = [M.z M.z .* exp.(-tp' ./ p.T1) .+ p.ρ .* (1 .- exp.(-tp' ./ p.T1))] #Calculate intermediate points
-        sig[:, :, 2] .= transpose(Mz[:, findall(seq.ADC)]) #Save state to signal
-        M.z .= Mz[:, end]
-    else
-        M.z .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1)) #Jump to the last point
+
+    t_seq = zero(T) # Time
+    for seq_idx=2:length(seq.t)
+        t_seq += seq.Δt[seq_idx-1]
+
+        #Effective Field
+        if size(x,2) > 1 #Motion
+            Bz_2 .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        else             #No motion
+            Bz_2 .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        end
+        
+        #Rotation
+        ϕ .= ϕ .+ (Bz_1 .+ Bz_2) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
+
+        #Acquired Signal
+        if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
+            sig[ADC_idx,:,1] .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+            if sim_method.save_Mz
+                sig[ADC_idx,:,2] .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
+            end
+            ADC_idx += 1
+        end
+
+        Bz_1, Bz_2 = Bz_2, Bz_1
     end
+
+    #Final Spin-State
+    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+    M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
+
     return nothing
 end
diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
index a6dabbd0a..3a69f72dd 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
@@ -22,6 +22,33 @@ function initialize_spins_state(
     return Xt, obj
 end
 
+"""Preallocated arrays for use in run_spin_precession."""
+struct BlochPrealloc{T} <: PreallocResult{T}
+    Bz_1::AbstractVector{T}
+    Bz_2::AbstractVector{T}
+    ϕ::AbstractVector{T}
+    Mxy::AbstractVector{Complex{T}}
+end
+
+Base.view(p::BlochPrealloc, i::UnitRange) = begin
+    @views BlochPrealloc(
+        p.Bz_1[i],
+        p.Bz_2[i],
+        p.ϕ[i],
+        p.Mxy[i]
+    )
+end
+
+"""Default preallocation function. Returns arrays for use in run_spin_precession."""
+function prealloc(sim_method::SimulationMethod, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
+    BlochPrealloc(
+        similar(obj.x),
+        similar(obj.x),
+        similar(obj.x),
+        similar(M.xy)
+    )
+end
+
 """
     run_spin_precession(obj, seq, Xt, sig)
 
@@ -43,27 +70,55 @@ function run_spin_precession!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::SimulationMethod,
-    backend::KA.Backend
+    backend::KA.Backend,
+    prealloc::BlochPrealloc
 ) where {T<:Real}
     #Simulation
     #Motion
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-    #Effective field
-    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
-    #Rotation
-    if is_ADC_on(seq)
-        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
-    else
-        ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
+    
+    #Initialize arrays
+    Bz_1 = prealloc.Bz_1
+    Bz_2 = prealloc.Bz_2
+    ϕ = prealloc.ϕ
+    Mxy = prealloc.Mxy
+    fill!(ϕ, zero(T))
+    Bz_1 .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
+
+    # Fill sig[1] if needed
+    ADC_idx = 1
+    if (seq.ADC[1])
+        sig[1] = sum(M.xy)
+        ADC_idx += 1
     end
-    #Mxy precession and relaxation, and Mz relaxation
-    tp   = cumsum(seq.Δt) # t' = t - t0
-    dur  = sum(seq.Δt)   # Total length, used for signal relaxation
-    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
-    M.xy .= Mxy[:, end]
-    M.z  .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1))
-    #Acquired signal
-    sig .= transpose(sum(Mxy[:, findall(seq.ADC)]; dims=1)) #<--- TODO: add coil sensitivities
+
+    t_seq = zero(T) # Time
+    for seq_idx=2:length(seq.t)
+        t_seq += seq.Δt[seq_idx-1]
+
+        #Effective Field
+        if size(x,2) > 1 #Motion
+            Bz_2 .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        else             #No motion
+            Bz_2 .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        end
+        
+        #Rotation
+        ϕ .= ϕ .+ (Bz_1 .+ Bz_2) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
+
+        #Acquired Signal
+        if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
+            Mxy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+            sig[ADC_idx] = sum(Mxy)
+            ADC_idx += 1
+        end
+
+        Bz_1, Bz_2 = Bz_2, Bz_1
+    end
+
+    #Final Spin-State
+    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+    M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
 
     return nothing
 end
diff --git a/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl
deleted file mode 100644
index 15728bcb6..000000000
--- a/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl
+++ /dev/null
@@ -1,50 +0,0 @@
-using KernelAbstractions: @index, @kernel
-
-## COV_EXCL_START
-#! format: off
-
-"""
-    cumsum2_kernel
-
-Simple kernel function, computes the cumulative sum of each row of a matrix. Operates
-in-place on the input matrix without allocating additional memory.
-
-# Arguments
-- 'A': matrix to compute cumsum on
-"""
-@kernel function cumsum_matrix_rows_kernel!(A)
-    i = @index(Global)
-
-    for k ∈ 2:size(A, 2)
-        @inbounds A[i, k] += A[i, k-1]
-    end
-end
-
-#! format: on
-## COV_EXCL_STOP
-
-"""
-    cumtrapz
-
-A more efficient GPU implementation of the cumtrapz method defined in TrapezoidalIntegration.jl.
-Uses a kernel to compute cumsum along the second dimension.
-
-# Arguments
-- `Δt`: (`1 x NΔt ::Matrix{Float64}`, `[s]`) delta time 1-row array
-- `x`: (`Ns x (NΔt+1) ::Matrix{Float64}`, `[T]`) magnitude of the field Gx * x + Gy * y +
-    Gz * z
-
-# Returns
-- `y`: (`Ns x NΔt ::Matrix{Float64}`, `[T*s]`) matrix where every column is the
-    cumulative integration over time of (Gx * x + Gy * y + Gz * z) * Δt for every spin of a
-    phantom
-"""
-function KomaMRIBase.cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}, backend::KA.GPU) where {T<:Real}
-    y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
-    cumsum_matrix_rows_kernel!(backend)(y, ndrange=size(y,1))
-    KA.synchronize(backend)
-    return y
-end
-
-#If on CPU, forward call to cumtrapz in KomaMRIBase
-KomaMRIBase.cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}, backend::KA.CPU) where {T<:Real} = KomaMRIBase.cumtrapz(Δt, x)
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index fabb828d1..2a6e0ed57 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -1,8 +1,8 @@
 abstract type SimulationMethod end #get all available types by using subtypes(KomaMRI.SimulationMethod)
 abstract type SpinStateRepresentation{T<:Real} end #get all available types by using subtypes(KomaMRI.SpinStateRepresentation)
+abstract type PreallocResult{T<:Real} end #get all available types by using subtypes(KomaMRI.PreallocResult)
 
 #Defined methods:
-include("Bloch/KernelFunctions.jl")
 include("Bloch/BlochSimulationMethod.jl")       #Defines Bloch simulation method
 include("Bloch/BlochDictSimulationMethod.jl")   #Defines BlochDict simulation method
 
@@ -84,7 +84,8 @@ function run_spin_precession_parallel!(
     sig::AbstractArray{Complex{T}},
     Xt::SpinStateRepresentation{T},
     sim_method::SimulationMethod,
-    backend::KA.Backend;
+    backend::KA.Backend,
+    prealloc::PreallocResult;
     Nthreads=Threads.nthreads(),
 ) where {T<:Real}
     parts = kfoldperm(length(obj), Nthreads)
@@ -92,7 +93,7 @@ function run_spin_precession_parallel!(
 
     ThreadsX.foreach(enumerate(parts)) do (i, p)
         run_spin_precession!(
-            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend
+            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend, @view(prealloc[p])
         )
     end
 
@@ -180,6 +181,7 @@ function run_sim_time_iter!(
     rfs = 0
     samples = 1
     progress_bar = Progress(Nblocks; desc="Running simulation...")
+    prealloc_result = prealloc(sim_method, obj, Xt) 
 
     for (block, p) in enumerate(parts)
         seq_block = @view seq[p]
@@ -196,7 +198,7 @@ function run_sim_time_iter!(
             rfs += 1
         else
             run_spin_precession_parallel!(
-                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend; Nthreads
+                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend, prealloc_result; Nthreads
             )
         end
         samples += Nadc

From 637b490159be5ea8a7e7ff26856def800d9b635f Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 15:19:48 -0500
Subject: [PATCH 02/24] Don't run Benchmark action for external PRs

---
 .github/workflows/Benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
index ef9074d5d..0cf328d1e 100644
--- a/.github/workflows/Benchmark.yml
+++ b/.github/workflows/Benchmark.yml
@@ -32,7 +32,7 @@ on:
 
 jobs:
   benchmark:
-    if: "!contains(github.event.head_commit.message, '[skip benchmarks]')"
+    if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4

From 1837de778f8a892f8567ad7ef34bcbd274c94ec0 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 15:26:54 -0500
Subject: [PATCH 03/24] Commit change for adding Suppressor as dependency

---
 KomaMRICore/Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/KomaMRICore/Project.toml b/KomaMRICore/Project.toml
index 5992b99fa..1d028c369 100644
--- a/KomaMRICore/Project.toml
+++ b/KomaMRICore/Project.toml
@@ -11,6 +11,7 @@ KomaMRIBase = "d0bc0b20-b151-4d03-b2a4-6ca51751cb9c"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
 ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d"
 
 [weakdeps]

From 405f8d1a63a56281c8608a96bc688cca89a07b1d Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 15:44:57 -0500
Subject: [PATCH 04/24] Still run with Threads.nthreads() if falling back to
 CPU

---
 KomaMRICore/src/simulation/SimulatorCore.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index 2a6e0ed57..10c5f2367 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -48,7 +48,7 @@ function default_sim_params(sim_params=Dict{String,Any}())
     sampling_params = KomaMRIBase.default_sampling_params()
     get!(sim_params, "gpu", true)
     get!(sim_params, "gpu_device", nothing)
-    get!(sim_params, "Nthreads", sim_params["gpu"] ? 1 : Threads.nthreads())
+    get!(sim_params, "Nthreads", Threads.nthreads())
     get!(sim_params, "Nblocks", 20)
     get!(sim_params, "Δt", sampling_params["Δt"])
     get!(sim_params, "Δt_rf", sampling_params["Δt_rf"])
@@ -336,9 +336,12 @@ function simulate(
     Xt, obj = initialize_spins_state(obj, sim_params["sim_method"])
     # Signal init
     Ndims = sim_output_dim(obj, seq, sys, sim_params["sim_method"])
-    sig = zeros(ComplexF64, Ndims..., sim_params["Nthreads"])
     backend = get_backend(sim_params["gpu"])
     sim_params["gpu"] &= backend isa KA.GPU
+    if sim_params["gpu"]
+        sim_params["Nthreads"] = 1
+    end
+    sig = zeros(ComplexF64, Ndims..., sim_params["Nthreads"])
     if !KA.supports_float64(backend) && sim_params["precision"] == "f64"
         sim_params["precision"] = "f32"
         @info """ Backend: '$(name(backend))' does not support 64-bit precision 

From 3bce91514cddb5d17bc18e1b775cf59c67cb7c2d Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 16:01:43 -0500
Subject: [PATCH 05/24] Warn if user tries to run on CPU without enabling
 multi-threading

---
 KomaMRICore/src/simulation/SimulatorCore.jl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index 10c5f2367..7cb209bfe 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -327,6 +327,13 @@ function simulate(
 )
     #Simulation parameter unpacking, and setting defaults if key is not defined
     sim_params = default_sim_params(sim_params)
+    #Warn if user is trying to run on CPU without enabling multi-threading
+    if (!sim_params["gpu"] && Threads.nthreads() == 1)
+        @info """
+            Simulation will be run on the CPU with only 1 thread. To
+            enable multi-threading, start julia with --threads=auto.
+        """ maxlog=1
+    end
     # Simulation init
     seqd = discretize(seq; sampling_params=sim_params) # Sampling of Sequence waveforms
     parts, excitation_bool = get_sim_ranges(seqd; Nblocks=sim_params["Nblocks"]) # Generating simulation blocks

From 7df32aaa9e899fe4ad13620440cd1b00b9a6a9af Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 16:54:54 -0500
Subject: [PATCH 06/24] Add space to confirm if error happens again

---
 KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
index 3a69f72dd..7bf531f6e 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
@@ -109,7 +109,7 @@ function run_spin_precession!(
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
             Mxy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
-            sig[ADC_idx] = sum(Mxy)
+            sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end
 

From c19c75e523b3ff86f1f3b1b66a9352e73a2edb47 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 17:27:57 -0500
Subject: [PATCH 07/24] Try putting -t_seq inside array

---
 KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
index 7bf531f6e..c6a7f5f2a 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
@@ -108,7 +108,7 @@ function run_spin_precession!(
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
-            Mxy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+            Mxy .= M.xy .* exp.([-t_seq] ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
             sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end

From 037561f0bc6c2daa71705e9446eb991a3406c4c4 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 11 Jul 2024 17:39:04 -0500
Subject: [PATCH 08/24] Try changing order of operations

---
 KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
index c6a7f5f2a..400df1e7b 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
@@ -108,7 +108,7 @@ function run_spin_precession!(
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
-            Mxy .= M.xy .* exp.([-t_seq] ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+            Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im * sin.(ϕ)))
             sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end

From b7fa2695594935ca3671b55ac12863878f188952 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Sat, 13 Jul 2024 15:17:17 -0500
Subject: [PATCH 09/24] Rename arrays

---
 .../Bloch/BlochDictSimulationMethod.jl        | 22 ++++++++---------
 .../simulation/Bloch/BlochSimulationMethod.jl | 24 +++++++++----------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
index 1dfe1c108..11c1efcc0 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
@@ -16,15 +16,15 @@ end
 
 """Preallocated arrays for use in run_spin_precession."""
 struct BlochDictPrealloc{T} <: PreallocResult{T}
-    Bz_1::AbstractVector{T}
-    Bz_2::AbstractVector{T}
+    Bz_old::AbstractVector{T}
+    Bz_new::AbstractVector{T}
     ϕ::AbstractVector{T}
 end
 
 Base.view(p::BlochDictPrealloc, i::UnitRange) = begin
     @views BlochDictPrealloc(
-        p.Bz_1[i],
-        p.Bz_2[i],
+        p.Bz_old[i],
+        p.Bz_new[i],
         p.ϕ[i]
     )
 end
@@ -67,11 +67,11 @@ function run_spin_precession!(
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
     
     #Initialize arrays
-    Bz_1 = prealloc.Bz_1
-    Bz_2 = prealloc.Bz_2
+    Bz_old = prealloc.Bz_old
+    Bz_new = prealloc.Bz_new
     ϕ = prealloc.ϕ
     fill!(ϕ, zero(T))
-    Bz_1 .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
+    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
 
     # Fill sig[1] if needed
     ADC_idx = 1
@@ -89,13 +89,13 @@ function run_spin_precession!(
 
         #Effective Field
         if size(x,2) > 1 #Motion
-            Bz_2 .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+            Bz_new .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
         else             #No motion
-            Bz_2 .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+            Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
         end
         
         #Rotation
-        ϕ .= ϕ .+ (Bz_1 .+ Bz_2) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
+        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
@@ -106,7 +106,7 @@ function run_spin_precession!(
             ADC_idx += 1
         end
 
-        Bz_1, Bz_2 = Bz_2, Bz_1
+        Bz_old, Bz_new = Bz_new, Bz_old
     end
 
     #Final Spin-State
diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
index 400df1e7b..dcf816073 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
@@ -24,16 +24,16 @@ end
 
 """Preallocated arrays for use in run_spin_precession."""
 struct BlochPrealloc{T} <: PreallocResult{T}
-    Bz_1::AbstractVector{T}
-    Bz_2::AbstractVector{T}
+    Bz_old::AbstractVector{T}
+    Bz_new::AbstractVector{T}
     ϕ::AbstractVector{T}
     Mxy::AbstractVector{Complex{T}}
 end
 
 Base.view(p::BlochPrealloc, i::UnitRange) = begin
     @views BlochPrealloc(
-        p.Bz_1[i],
-        p.Bz_2[i],
+        p.Bz_old[i],
+        p.Bz_new[i],
         p.ϕ[i],
         p.Mxy[i]
     )
@@ -78,12 +78,12 @@ function run_spin_precession!(
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
     
     #Initialize arrays
-    Bz_1 = prealloc.Bz_1
-    Bz_2 = prealloc.Bz_2
+    Bz_old = prealloc.Bz_old
+    Bz_new = prealloc.Bz_new
     ϕ = prealloc.ϕ
     Mxy = prealloc.Mxy
     fill!(ϕ, zero(T))
-    Bz_1 .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
+    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
 
     # Fill sig[1] if needed
     ADC_idx = 1
@@ -98,22 +98,22 @@ function run_spin_precession!(
 
         #Effective Field
         if size(x,2) > 1 #Motion
-            Bz_2 .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+            Bz_new .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
         else             #No motion
-            Bz_2 .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+            Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
         end
         
         #Rotation
-        ϕ .= ϕ .+ (Bz_1 .+ Bz_2) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
+        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
 
         #Acquired Signal
-        if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
+        if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
             Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im * sin.(ϕ)))
             sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end
 
-        Bz_1, Bz_2 = Bz_2, Bz_1
+        Bz_old, Bz_new = Bz_new, Bz_old
     end
 
     #Final Spin-State

From bdaa16d4f1febc570b5d4ec829ac1b41bb7b95e6 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Tue, 16 Jul 2024 14:40:34 -0500
Subject: [PATCH 10/24] Leave BlochDict alone for now

---
 .../Bloch/BlochDictSimulationMethod.jl        | 95 +++++--------------
 .../src/simulation/Bloch/KernelFunctions.jl   | 50 ++++++++++
 KomaMRICore/src/simulation/SimulatorCore.jl   |  1 +
 3 files changed, 74 insertions(+), 72 deletions(-)
 create mode 100644 KomaMRICore/src/simulation/Bloch/KernelFunctions.jl

diff --git a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
index 11c1efcc0..8e177153a 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
@@ -14,30 +14,6 @@ function sim_output_dim(
     return (sum(seq.ADC.N), length(obj), out_state_dim)
 end
 
-"""Preallocated arrays for use in run_spin_precession."""
-struct BlochDictPrealloc{T} <: PreallocResult{T}
-    Bz_old::AbstractVector{T}
-    Bz_new::AbstractVector{T}
-    ϕ::AbstractVector{T}
-end
-
-Base.view(p::BlochDictPrealloc, i::UnitRange) = begin
-    @views BlochDictPrealloc(
-        p.Bz_old[i],
-        p.Bz_new[i],
-        p.ϕ[i]
-    )
-end
-
-"""BlochDict preallocation function. Returns arrays for use in run_spin_precession."""
-function prealloc(sim_method::BlochDict, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
-    BlochDictPrealloc(
-        similar(obj.x),
-        similar(obj.x),
-        similar(obj.x),
-    )
-end
-
 """
     run_spin_precession(obj, seq, Xt, sig)
 
@@ -60,58 +36,33 @@ function run_spin_precession!(
     M::Mag{T},
     sim_method::BlochDict,
     backend::KA.Backend,
-    prealloc::BlochDictPrealloc
+    prealloc::BlochPrealloc
 ) where {T<:Real}
     #Simulation
     #Motion
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-    
-    #Initialize arrays
-    Bz_old = prealloc.Bz_old
-    Bz_new = prealloc.Bz_new
-    ϕ = prealloc.ϕ
-    fill!(ϕ, zero(T))
-    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
-
-    # Fill sig[1] if needed
-    ADC_idx = 1
-    if (seq.ADC[1])
-        sig[1,:,1] .= M.xy
-        if sim_method.save_Mz
-            sig[1,:,2] .= M.z
-        end
-        ADC_idx += 1
+    #Effective field
+    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
+    #Rotation
+    if is_ADC_on(seq)
+        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+    else
+        ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
     end
-
-    t_seq = zero(T) # Time
-    for seq_idx=2:length(seq.t)
-        t_seq += seq.Δt[seq_idx-1]
-
-        #Effective Field
-        if size(x,2) > 1 #Motion
-            Bz_new .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
-        else             #No motion
-            Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
-        end
-        
-        #Rotation
-        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
-
-        #Acquired Signal
-        if seq_idx <= length(seq.ADC) && any(seq.ADC[seq_idx,:])
-            sig[ADC_idx,:,1] .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
-            if sim_method.save_Mz
-                sig[ADC_idx,:,2] .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
-            end
-            ADC_idx += 1
-        end
-
-        Bz_old, Bz_new = Bz_new, Bz_old
+    #Mxy precession and relaxation, and Mz relaxation
+    tp = cumsum(seq.Δt) # t' = t - t0
+    dur = sum(seq.Δt)   # Total length, used for signal relaxation
+    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
+    M.xy .= Mxy[:, end]
+    #Acquired signal
+    sig[:, :, 1] .= transpose(Mxy[:, findall(seq.ADC)])
+
+    if sim_method.save_Mz
+        Mz = [M.z M.z .* exp.(-tp' ./ p.T1) .+ p.ρ .* (1 .- exp.(-tp' ./ p.T1))] #Calculate intermediate points
+        sig[:, :, 2] .= transpose(Mz[:, findall(seq.ADC)]) #Save state to signal
+        M.z .= Mz[:, end]
+    else
+        M.z .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1)) #Jump to the last point
     end
-
-    #Final Spin-State
-    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
-    M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
-
     return nothing
-end
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl
new file mode 100644
index 000000000..15728bcb6
--- /dev/null
+++ b/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl
@@ -0,0 +1,50 @@
+using KernelAbstractions: @index, @kernel
+
+## COV_EXCL_START
+#! format: off
+
+"""
+    cumsum2_kernel
+
+Simple kernel function, computes the cumulative sum of each row of a matrix. Operates
+in-place on the input matrix without allocating additional memory.
+
+# Arguments
+- 'A': matrix to compute cumsum on
+"""
+@kernel function cumsum_matrix_rows_kernel!(A)
+    i = @index(Global)
+
+    for k ∈ 2:size(A, 2)
+        @inbounds A[i, k] += A[i, k-1]
+    end
+end
+
+#! format: on
+## COV_EXCL_STOP
+
+"""
+    cumtrapz
+
+A more efficient GPU implementation of the cumtrapz method defined in TrapezoidalIntegration.jl.
+Uses a kernel to compute cumsum along the second dimension.
+
+# Arguments
+- `Δt`: (`1 x NΔt ::Matrix{Float64}`, `[s]`) delta time 1-row array
+- `x`: (`Ns x (NΔt+1) ::Matrix{Float64}`, `[T]`) magnitude of the field Gx * x + Gy * y +
+    Gz * z
+
+# Returns
+- `y`: (`Ns x NΔt ::Matrix{Float64}`, `[T*s]`) matrix where every column is the
+    cumulative integration over time of (Gx * x + Gy * y + Gz * z) * Δt for every spin of a
+    phantom
+"""
+function KomaMRIBase.cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}, backend::KA.GPU) where {T<:Real}
+    y = (x[:, 2:end] .+ x[:, 1:end-1]) .* (Δt / 2)
+    cumsum_matrix_rows_kernel!(backend)(y, ndrange=size(y,1))
+    KA.synchronize(backend)
+    return y
+end
+
+#If on CPU, forward call to cumtrapz in KomaMRIBase
+KomaMRIBase.cumtrapz(Δt::AbstractArray{T}, x::AbstractArray{T}, backend::KA.CPU) where {T<:Real} = KomaMRIBase.cumtrapz(Δt, x)
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index 7cb209bfe..f9fcbf565 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -5,6 +5,7 @@ abstract type PreallocResult{T<:Real} end #get all available types by using subt
 #Defined methods:
 include("Bloch/BlochSimulationMethod.jl")       #Defines Bloch simulation method
 include("Bloch/BlochDictSimulationMethod.jl")   #Defines BlochDict simulation method
+include("Bloch/KernelFunctions.jl")
 
 """
     sim_params = default_sim_params(sim_params=Dict{String,Any}())

From aeefa4ae839da8abd2cee6e121a91fc40e561ac0 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Tue, 16 Jul 2024 18:30:24 -0500
Subject: [PATCH 11/24] Reorganize files, separate CPU specialization from
 default

---
 KomaMRICore/Project.toml                      |  5 +-
 KomaMRICore/ext/KomaAMDGPUExt.jl              |  2 -
 KomaMRICore/ext/KomaCUDAExt.jl                |  2 -
 KomaMRICore/ext/KomaMetalExt.jl               | 10 +-
 KomaMRICore/ext/KomaoneAPIExt.jl              |  9 +-
 .../simulation/SimMethods/Bloch/BlochCPU.jl   | 94 ++++++++++++++++++
 .../BlochDict}/BlochDictSimulationMethod.jl   |  2 +-
 .../{Bloch => SimMethods}/KernelFunctions.jl  |  0
 .../{Bloch => SimMethods}/Magnetization.jl    |  0
 .../SimulationMethod.jl}                      | 97 ++++++-------------
 KomaMRICore/src/simulation/SimulatorCore.jl   |  9 +-
 11 files changed, 144 insertions(+), 86 deletions(-)
 create mode 100644 KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
 rename KomaMRICore/src/simulation/{Bloch => SimMethods/BlochDict}/BlochDictSimulationMethod.jl (98%)
 rename KomaMRICore/src/simulation/{Bloch => SimMethods}/KernelFunctions.jl (100%)
 rename KomaMRICore/src/simulation/{Bloch => SimMethods}/Magnetization.jl (100%)
 rename KomaMRICore/src/simulation/{Bloch/BlochSimulationMethod.jl => SimMethods/SimulationMethod.jl} (59%)

diff --git a/KomaMRICore/Project.toml b/KomaMRICore/Project.toml
index 1d028c369..b14f5548a 100644
--- a/KomaMRICore/Project.toml
+++ b/KomaMRICore/Project.toml
@@ -11,7 +11,6 @@ KomaMRIBase = "d0bc0b20-b151-4d03-b2a4-6ca51751cb9c"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
-Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
 ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d"
 
 [weakdeps]
@@ -27,19 +26,19 @@ KomaMetalExt = "Metal"
 KomaoneAPIExt = "oneAPI"
 
 [compat]
-Adapt = "3, 4"
 AMDGPU = "0.9"
+Adapt = "3, 4"
 CUDA = "3, 4, 5"
 Functors = "0.4"
 KernelAbstractions = "0.9"
 KomaMRIBase = "0.9"
 Metal = "1"
-oneAPI = "1"
 Pkg = "1.4"
 ProgressMeter = "1"
 Reexport = "1"
 ThreadsX = "0.1"
 julia = "1.9"
+oneAPI = "1"
 
 [workspace]
 projects = ["test"]
diff --git a/KomaMRICore/ext/KomaAMDGPUExt.jl b/KomaMRICore/ext/KomaAMDGPUExt.jl
index 5c14edb04..d0b5f1350 100644
--- a/KomaMRICore/ext/KomaAMDGPUExt.jl
+++ b/KomaMRICore/ext/KomaAMDGPUExt.jl
@@ -1,7 +1,6 @@
 module KomaAMDGPUExt
 
 using AMDGPU
-using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -21,7 +20,6 @@ end
 
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], ROCBackend())
-    @suppress AMDGPU.allowscalar(true)
 end
 
 end
\ No newline at end of file
diff --git a/KomaMRICore/ext/KomaCUDAExt.jl b/KomaMRICore/ext/KomaCUDAExt.jl
index 0177dc1b8..cc97a0a96 100644
--- a/KomaMRICore/ext/KomaCUDAExt.jl
+++ b/KomaMRICore/ext/KomaCUDAExt.jl
@@ -1,7 +1,6 @@
 module KomaCUDAExt
 
 using CUDA
-using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -20,7 +19,6 @@ end
 
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], CUDABackend())
-    @suppress CUDA.allowscalar(true)
 end
 
 end
\ No newline at end of file
diff --git a/KomaMRICore/ext/KomaMetalExt.jl b/KomaMRICore/ext/KomaMetalExt.jl
index 31172d0ec..d50af7901 100644
--- a/KomaMRICore/ext/KomaMetalExt.jl
+++ b/KomaMRICore/ext/KomaMetalExt.jl
@@ -3,7 +3,6 @@
 module KomaMetalExt
 
 using Metal
-using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -17,9 +16,16 @@ function KomaMRICore._print_devices(::MetalBackend)
     @info "Metal device type: $(KomaMRICore.device_name(MetalBackend()))"
 end
 
+#Temporary workaround for https://github.com/JuliaGPU/Metal.jl/issues/348
+#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code
+#can be removed
+Base.cumsum(x::MtlVector{T}) where T = convert(MtlVector{T}, cumsum(KomaMRICore.cpu(x)))
+Base.cumsum(x::MtlArray{T}; dims) where T = convert(MtlArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
+Base.findall(x::MtlVector{Bool}) = convert(MtlVector, findall(KomaMRICore.cpu(x)))
+
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], MetalBackend())
-    @suppress Metal.allowscalar(true)
+    @warn "Metal does not support all array operations used by KomaMRI (https://github.com/JuliaGPU/Metal.jl/issues/348). GPU performance may be slower than expected"
 end
 
 end
diff --git a/KomaMRICore/ext/KomaoneAPIExt.jl b/KomaMRICore/ext/KomaoneAPIExt.jl
index 46dfcfeb3..0bec755fc 100644
--- a/KomaMRICore/ext/KomaoneAPIExt.jl
+++ b/KomaMRICore/ext/KomaoneAPIExt.jl
@@ -1,7 +1,6 @@
 module KomaoneAPIExt
 
 using oneAPI
-using Suppressor
 import KomaMRICore
 import Adapt
 
@@ -18,9 +17,15 @@ function KomaMRICore._print_devices(::oneAPIBackend)
     @info "$(length(oneAPI.devices())) oneAPI capable device(s)." devices...
 end
 
+#Temporary workaround since oneAPI.jl (similar to Metal) does not support some array operations
+#Once run_spin_excitation! and run_spin_precession! are kernel-based, this code can be removed
+Base.cumsum(x::oneVector{T}) where T = convert(oneVector{T}, cumsum(KomaMRICore.cpu(x)))
+Base.cumsum(x::oneArray{T}; dims) where T = convert(oneArray{T}, cumsum(KomaMRICore.cpu(x), dims=dims))
+Base.findall(x::oneVector{Bool}) = convert(oneVector, findall(KomaMRICore.cpu(x)))
+
 function __init__()
     push!(KomaMRICore.LOADED_BACKENDS[], oneAPIBackend())
-    @suppress oneAPI.allowscalar(true)
+    @warn "oneAPI does not support all array operations used by KomaMRI. GPU performance may be slower than expected"
 end
 
 end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
new file mode 100644
index 000000000..88abc20c5
--- /dev/null
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -0,0 +1,94 @@
+"""Stores arrays for use in Bloch CPU run_spin_precession function."""
+struct BlochCPUPrealloc{T} <: PreallocResult{T}
+    Bz_old::AbstractVector{T}
+    Bz_new::AbstractVector{T}
+    ϕ::AbstractVector{T}
+    Mxy::AbstractVector{Complex{T}}
+end
+
+Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
+    @views BlochCPUPrealloc(
+        p.Bz_old[i],
+        p.Bz_new[i],
+        p.ϕ[i],
+        p.Mxy[i]
+    )
+end
+
+"""Preallocates arrays for use in run_spin_precession."""
+function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
+    return BlochCPUPrealloc(
+        similar(obj.x),
+        similar(obj.x),
+        similar(obj.x),
+        similar(M.xy)
+    )
+end
+
+"""
+    run_spin_precession(obj, seq, Xt, sig)
+
+Specialized run_spin_precession function for simulating a sequence on the CPU with Bloch as the
+simulation method. Compared with the default run_spin_precession function in SimMethod.jl, this
+function uses a loop to step through time and does not allocate a matrix of size NSpins x seq.t.
+The four arrays of size Nspins x 1 are preallocated in SimulatorCore::run_sim_time_iter! so that
+they can be re-used from block-to-block.
+"""
+function run_spin_precession!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::Bloch,
+    backend::KA.CPU,
+    prealloc::BlochCPUPrealloc
+) where {T<:Real}
+    #Simulation
+    #Motion
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+    
+    #Initialize arrays
+    Bz_old = prealloc.Bz_old
+    Bz_new = prealloc.Bz_new
+    ϕ = prealloc.ϕ
+    Mxy = prealloc.Mxy
+    fill!(ϕ, zero(T))
+    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
+
+    # Fill sig[1] if needed
+    ADC_idx = 1
+    if (seq.ADC[1])
+        sig[1] = sum(M.xy)
+        ADC_idx += 1
+    end
+
+    t_seq = zero(T) # Time
+    for seq_idx=2:length(seq.t)
+        t_seq += seq.Δt[seq_idx-1]
+
+        #Effective Field
+        if size(x,2) > 1 #Motion
+            Bz_new .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        else             #No motion
+            Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        end
+        
+        #Rotation
+        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
+
+        #Acquired Signal
+        if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
+            Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im * sin.(ϕ)))
+            sig[ADC_idx] = sum(Mxy) 
+            ADC_idx += 1
+        end
+
+        Bz_old, Bz_new = Bz_new, Bz_old
+    end
+
+    #Final Spin-State
+    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+    M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
+
+    return nothing
+end
diff --git a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
similarity index 98%
rename from KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
rename to KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
index 8e177153a..1507bfa9d 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochDictSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
@@ -36,7 +36,7 @@ function run_spin_precession!(
     M::Mag{T},
     sim_method::BlochDict,
     backend::KA.Backend,
-    prealloc::BlochPrealloc
+    prealloc::PreallocResult
 ) where {T<:Real}
     #Simulation
     #Motion
diff --git a/KomaMRICore/src/simulation/Bloch/KernelFunctions.jl b/KomaMRICore/src/simulation/SimMethods/KernelFunctions.jl
similarity index 100%
rename from KomaMRICore/src/simulation/Bloch/KernelFunctions.jl
rename to KomaMRICore/src/simulation/SimMethods/KernelFunctions.jl
diff --git a/KomaMRICore/src/simulation/Bloch/Magnetization.jl b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
similarity index 100%
rename from KomaMRICore/src/simulation/Bloch/Magnetization.jl
rename to KomaMRICore/src/simulation/SimMethods/Magnetization.jl
diff --git a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
similarity index 59%
rename from KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
rename to KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
index dcf816073..e63696acc 100644
--- a/KomaMRICore/src/simulation/Bloch/BlochSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
@@ -1,8 +1,11 @@
+#Bloch is the default simulation method
 struct Bloch <: SimulationMethod end
 
 export Bloch
 
-include("Magnetization.jl") #Defines Mag <: SpinStateRepresentation
+include("Magnetization.jl")
+include("KernelFunctions.jl")
+
 @functor Mag #Gives gpu acceleration capabilities, see GPUFunctions.jl
 
 function sim_output_dim(
@@ -22,32 +25,15 @@ function initialize_spins_state(
     return Xt, obj
 end
 
-"""Preallocated arrays for use in run_spin_precession."""
-struct BlochPrealloc{T} <: PreallocResult{T}
-    Bz_old::AbstractVector{T}
-    Bz_new::AbstractVector{T}
-    ϕ::AbstractVector{T}
-    Mxy::AbstractVector{Complex{T}}
-end
+abstract type PreallocResult{T<:Real} end
 
-Base.view(p::BlochPrealloc, i::UnitRange) = begin
-    @views BlochPrealloc(
-        p.Bz_old[i],
-        p.Bz_new[i],
-        p.ϕ[i],
-        p.Mxy[i]
-    )
-end
+"""Default preallocation struct, stores nothing."""
+struct DefaultPreAlloc{T} <: PreallocResult{T} end
 
-"""Default preallocation function. Returns arrays for use in run_spin_precession."""
-function prealloc(sim_method::SimulationMethod, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
-    BlochPrealloc(
-        similar(obj.x),
-        similar(obj.x),
-        similar(obj.x),
-        similar(M.xy)
-    )
-end
+Base.view(p::DefaultPreAlloc, i::UnitRange) = p
+
+"""Default preallocation function."""
+prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}) where {T<:Real} = DefaultPreAlloc{T}()
 
 """
     run_spin_precession(obj, seq, Xt, sig)
@@ -71,54 +57,27 @@ function run_spin_precession!(
     M::Mag{T},
     sim_method::SimulationMethod,
     backend::KA.Backend,
-    prealloc::BlochPrealloc
+    prealloc::PreallocResult
 ) where {T<:Real}
     #Simulation
     #Motion
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-    
-    #Initialize arrays
-    Bz_old = prealloc.Bz_old
-    Bz_new = prealloc.Bz_new
-    ϕ = prealloc.ϕ
-    Mxy = prealloc.Mxy
-    fill!(ϕ, zero(T))
-    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
-
-    # Fill sig[1] if needed
-    ADC_idx = 1
-    if (seq.ADC[1])
-        sig[1] = sum(M.xy)
-        ADC_idx += 1
+    #Effective field
+    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
+    #Rotation
+    if is_ADC_on(seq)
+        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+    else
+        ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
     end
-
-    t_seq = zero(T) # Time
-    for seq_idx=2:length(seq.t)
-        t_seq += seq.Δt[seq_idx-1]
-
-        #Effective Field
-        if size(x,2) > 1 #Motion
-            Bz_new .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
-        else             #No motion
-            Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
-        end
-        
-        #Rotation
-        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
-
-        #Acquired Signal
-        if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
-            Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im * sin.(ϕ)))
-            sig[ADC_idx] = sum(Mxy) 
-            ADC_idx += 1
-        end
-
-        Bz_old, Bz_new = Bz_new, Bz_old
-    end
-
-    #Final Spin-State
-    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
-    M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
+    #Mxy precession and relaxation, and Mz relaxation
+    tp   = cumsum(seq.Δt) # t' = t - t0
+    dur  = sum(seq.Δt)   # Total length, used for signal relaxation
+    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
+    M.xy .= Mxy[:, end]
+    M.z  .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1))
+    #Acquired signal
+    sig .= transpose(sum(Mxy[:, findall(seq.ADC)]; dims=1)) #<--- TODO: add coil sensitivities
 
     return nothing
 end
@@ -164,4 +123,4 @@ function run_spin_excitation!(
     #Acquired signal
     #sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
     return nothing
-end
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index f9fcbf565..a38a5ce44 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -1,11 +1,10 @@
 abstract type SimulationMethod end #get all available types by using subtypes(KomaMRI.SimulationMethod)
 abstract type SpinStateRepresentation{T<:Real} end #get all available types by using subtypes(KomaMRI.SpinStateRepresentation)
-abstract type PreallocResult{T<:Real} end #get all available types by using subtypes(KomaMRI.PreallocResult)
 
 #Defined methods:
-include("Bloch/BlochSimulationMethod.jl")       #Defines Bloch simulation method
-include("Bloch/BlochDictSimulationMethod.jl")   #Defines BlochDict simulation method
-include("Bloch/KernelFunctions.jl")
+include("SimMethods/SimulationMethod.jl")  #Defines Bloch simulation method
+include("SimMethods/BlochDict/BlochDictSimulationMethod.jl")   #Defines BlochDict simulation method
+include("SimMethods/Bloch/BlochCPU.jl") #Defines specialized Bloch functions for CPU
 
 """
     sim_params = default_sim_params(sim_params=Dict{String,Any}())
@@ -182,7 +181,7 @@ function run_sim_time_iter!(
     rfs = 0
     samples = 1
     progress_bar = Progress(Nblocks; desc="Running simulation...")
-    prealloc_result = prealloc(sim_method, obj, Xt) 
+    prealloc_result = prealloc(sim_method, backend, obj, Xt) 
 
     for (block, p) in enumerate(parts)
         seq_block = @view seq[p]

From bf112484c89aa1331f9f0b0259f55b62173b1740 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Tue, 16 Jul 2024 19:09:34 -0500
Subject: [PATCH 12/24] Try reverting Benchmark.yml change to see if action
 runs

---
 .github/workflows/Benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
index 0cf328d1e..49a124195 100644
--- a/.github/workflows/Benchmark.yml
+++ b/.github/workflows/Benchmark.yml
@@ -32,7 +32,7 @@ on:
 
 jobs:
   benchmark:
-    if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
+    if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4

From 55b855cfd038f0dd8feca3b28415cb5a6bef2e44 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Wed, 17 Jul 2024 18:02:17 -0500
Subject: [PATCH 13/24] Optimize run_spin_excitation! for CPU

---
 .../simulation/SimMethods/Bloch/BlochCPU.jl   | 105 ++++++++++++++----
 .../simulation/SimMethods/Magnetization.jl    |   6 +
 .../simulation/SimMethods/SimulationMethod.jl |   2 +
 KomaMRICore/src/simulation/SimulatorCore.jl   |   8 +-
 4 files changed, 94 insertions(+), 27 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index 88abc20c5..4fc503094 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -1,17 +1,25 @@
 """Stores arrays for use in Bloch CPU run_spin_precession function."""
 struct BlochCPUPrealloc{T} <: PreallocResult{T}
-    Bz_old::AbstractVector{T}
-    Bz_new::AbstractVector{T}
-    ϕ::AbstractVector{T}
-    Mxy::AbstractVector{Complex{T}}
+    Bz_1::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    Bz_2::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    Bz_3::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    Bz_4::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    Bz_5::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    Mxy_1::AbstractVector{Complex{T}}     # Vector{Complex{T}}(Nspins x 1)
+    Mxy_2::AbstractVector{Complex{T}}     # Vector{Complex{T}}(Nspins x 1)
+    Mxy_3::AbstractVector{Complex{T}}     # Vector{Complex{T}}(Nspins x 1)
 end
 
 Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
     @views BlochCPUPrealloc(
-        p.Bz_old[i],
-        p.Bz_new[i],
-        p.ϕ[i],
-        p.Mxy[i]
+        p.Bz_1[i],
+        p.Bz_2[i],
+        p.Bz_3[i],
+        p.Bz_4[i],
+        p.Bz_5[i],
+        p.Mxy_1[i],
+        p.Mxy_2[i],
+        p.Mxy_3[i]
     )
 end
 
@@ -21,18 +29,21 @@ function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}
         similar(obj.x),
         similar(obj.x),
         similar(obj.x),
+        similar(obj.x),
+        similar(obj.x),
+        similar(M.xy),
+        similar(M.xy),
         similar(M.xy)
     )
 end
 
 """
-    run_spin_precession(obj, seq, Xt, sig)
+    run_spin_precession(obj, seq, Xt, sig, M, sim_method, backend, prealloc)
 
-Specialized run_spin_precession function for simulating a sequence on the CPU with Bloch as the
-simulation method. Compared with the default run_spin_precession function in SimMethod.jl, this
-function uses a loop to step through time and does not allocate a matrix of size NSpins x seq.t.
-The four arrays of size Nspins x 1 are preallocated in SimulatorCore::run_sim_time_iter! so that
-they can be re-used from block-to-block.
+Alternate implementation of the run_spin_precession! function in SimulationMethod.jl optimized
+for the CPU. Uses a loop to step through time instead of allocating a matrix of size NSpins x seq.t.
+The Bz_old, Bz_new, ϕ, and Mxy arrays are pre-allocated in run_sim_time_iter! so that they can be
+re-used from block to block.
 """
 function run_spin_precession!(
     p::Phantom{T},
@@ -45,13 +56,13 @@ function run_spin_precession!(
 ) where {T<:Real}
     #Simulation
     #Motion
-    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t[1,:]')
     
     #Initialize arrays
-    Bz_old = prealloc.Bz_old
-    Bz_new = prealloc.Bz_new
-    ϕ = prealloc.ϕ
-    Mxy = prealloc.Mxy
+    Bz_old = prealloc.Bz_1
+    Bz_new = prealloc.Bz_2
+    ϕ = prealloc.Bz_3
+    Mxy = prealloc.Mxy_1
     fill!(ϕ, zero(T))
     Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
 
@@ -64,14 +75,11 @@ function run_spin_precession!(
 
     t_seq = zero(T) # Time
     for seq_idx=2:length(seq.t)
+        x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t[seq_idx,:]')
         t_seq += seq.Δt[seq_idx-1]
 
         #Effective Field
-        if size(x,2) > 1 #Motion
-            Bz_new .= x[:,seq_idx] .* seq.Gx[seq_idx] .+ y[:,seq_idx] .* seq.Gy[seq_idx] .+ z[:,seq_idx] .* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
-        else             #No motion
-            Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
-        end
+        Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
         
         #Rotation
         ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
@@ -92,3 +100,52 @@ function run_spin_precession!(
 
     return nothing
 end
+
+"""
+    run_spin_excitation!(obj, seq, sig, M, sim_method, backend, prealloc)
+
+Alternate implementation of the run_spin_excitation! function in SimulationMethod.jl 
+optimized for the CPU. Uses preallocation for all arrays to reduce memory usage.
+"""
+function run_spin_excitation!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::Bloch,
+    backend::KA.CPU,
+    prealloc::BlochCPUPrealloc
+) where {T<:Real}
+    ΔBz = prealloc.Bz_1
+    Bz = prealloc.Bz_2
+    B = prealloc.Bz_3
+    φ = prealloc.Bz_4
+    α = prealloc.Mxy_1
+    β = prealloc.Mxy_2
+    Maux_xy = prealloc.Mxy_3
+    Maux_z = prealloc.Bz_5
+
+    #Can be calculated outside of loop
+    ΔBz .= p.Δw ./ T(2π * γ)
+
+    #Simulation
+    for s in seq #This iterates over seq, "s = seq[i,:]"
+        #Motion
+        x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
+        #Effective field
+        Bz .= (s.Gx .* x .+ s.Gy .* y .+ s.Gz .* z) .+ ΔBz .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
+        B .= sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
+        B[B .== 0] .= eps(T)
+        #Spinor Rotation
+        φ .= T(-2π * γ) * (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
+        α .= cos.(φ/2) .- 1im * (Bz ./ B) .* sin.(φ/2)
+        β .= -1im * (s.B1 ./ B) .* sin.(φ/2)
+        mul!(Spinor(α, β), M, Maux_xy, Maux_z)
+        #Relaxation
+        M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)
+        M.z .= M.z .* exp.(-s.Δt ./ p.T1) .+ p.ρ .* (1 .- exp.(-s.Δt ./ p.T1))
+    end
+    #Acquired signal
+    #sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
+    return nothing
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
index 78552cf71..a2ebf6709 100644
--- a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
@@ -53,6 +53,12 @@ mul!(s::Spinor, M::Mag) = begin
     M.xy .= M_aux.xy
     M.z  .= M_aux.z
 end
+mul!(s::Spinor, M::Mag, Maux_xy, Maux_z) = begin
+    Maux_xy .= 2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy)
+    Maux_z .= (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2*real.(s.α.*s.β.*conj.(M.xy))
+    M.xy .= Maux_xy
+    M.z .= Maux_z
+end
 *(s::Spinor, M::Mag) = begin
     Mag(
         2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
diff --git a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
index e63696acc..2b4bfa838 100644
--- a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
@@ -103,6 +103,8 @@ function run_spin_excitation!(
     sig::AbstractArray{Complex{T}},
     M::Mag{T},
     sim_method::SimulationMethod,
+    backend::KA.Backend,
+    prealloc::PreallocResult
 ) where {T<:Real}
     #Simulation
     for s in seq #This iterates over seq, "s = seq[i,:]"
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index a38a5ce44..9245f28cd 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -124,7 +124,9 @@ function run_spin_excitation_parallel!(
     seq::DiscreteSequence{T},
     sig::AbstractArray{Complex{T}},
     Xt::SpinStateRepresentation{T},
-    sim_method::SimulationMethod;
+    sim_method::SimulationMethod,
+    backend::KA.Backend,
+    prealloc::PreallocResult;
     Nthreads=Threads.nthreads(),
 ) where {T<:Real}
     parts = kfoldperm(length(obj), Nthreads)
@@ -132,7 +134,7 @@ function run_spin_excitation_parallel!(
 
     ThreadsX.foreach(enumerate(parts)) do (i, p)
         run_spin_excitation!(
-            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method
+            @view(obj[p]), seq, @view(sig[dims..., i]), @view(Xt[p]), sim_method, backend, @view(prealloc[p])
         )
     end
 
@@ -193,7 +195,7 @@ function run_sim_time_iter!(
         # Simulation wrappers
         if excitation_bool[block]
             run_spin_excitation_parallel!(
-                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method; Nthreads
+                obj, seq_block, @view(sig[acq_samples, dims...]), Xt, sim_method, backend, prealloc_result; Nthreads
             )
             rfs += 1
         else

From 3d41cfdcd37e3aeb9ddea31f9a020f8beab35ea5 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 10:50:54 -0500
Subject: [PATCH 14/24] Fix incomplete broadcasting

---
 .../simulation/SimMethods/Bloch/BlochCPU.jl    | 18 +++++++++---------
 .../src/simulation/SimMethods/Magnetization.jl | 12 ++++++------
 .../simulation/SimMethods/SimulationMethod.jl  | 12 ++++++------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index 4fc503094..35f6e3159 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -64,7 +64,7 @@ function run_spin_precession!(
     ϕ = prealloc.Bz_3
     Mxy = prealloc.Mxy_1
     fill!(ϕ, zero(T))
-    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw / T(2π * γ)
+    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw ./ T(2π .* γ)
 
     # Fill sig[1] if needed
     ADC_idx = 1
@@ -79,14 +79,14 @@ function run_spin_precession!(
         t_seq += seq.Δt[seq_idx-1]
 
         #Effective Field
-        Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z.* seq.Gz[seq_idx] .+ p.Δw / T(2π * γ)
+        Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z .* seq.Gz[seq_idx] .+ p.Δw ./ T(2π .* γ)
         
         #Rotation
-        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π * γ) * seq.Δt[seq_idx-1] / 2)
+        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π .* γ) .* seq.Δt[seq_idx-1] ./ 2)
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
-            Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im * sin.(ϕ)))
+            Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im .* sin.(ϕ)))
             sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end
@@ -95,7 +95,7 @@ function run_spin_precession!(
     end
 
     #Final Spin-State
-    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))
+    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))
     M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
 
     return nothing
@@ -126,7 +126,7 @@ function run_spin_excitation!(
     Maux_z = prealloc.Bz_5
 
     #Can be calculated outside of loop
-    ΔBz .= p.Δw ./ T(2π * γ)
+    ΔBz .= p.Δw ./ T(2π .* γ)
 
     #Simulation
     for s in seq #This iterates over seq, "s = seq[i,:]"
@@ -137,9 +137,9 @@ function run_spin_excitation!(
         B .= sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
         B[B .== 0] .= eps(T)
         #Spinor Rotation
-        φ .= T(-2π * γ) * (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
-        α .= cos.(φ/2) .- 1im * (Bz ./ B) .* sin.(φ/2)
-        β .= -1im * (s.B1 ./ B) .* sin.(φ/2)
+        φ .= T(-2π .* γ) .* (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
+        α .= cos.(φ ./ 2) .- 1im .* (Bz ./ B) .* sin.(φ ./ 2)
+        β .= -1im .* (s.B1 ./ B) .* sin.(φ ./ 2)
         mul!(Spinor(α, β), M, Maux_xy, Maux_z)
         #Relaxation
         M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)
diff --git a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
index a2ebf6709..a5cd4947c 100644
--- a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
@@ -47,21 +47,21 @@ IEEE Transactions on Medical Imaging, 10(1), 53-65. doi:10.1109/42.75611
 """
 mul!(s::Spinor, M::Mag) = begin
     M_aux = Mag(
-        2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
-        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2*real.(s.α.*s.β.*conj.(M.xy))
+        2 .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
+        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2 .*real.(s.α.*s.β.*conj.(M.xy))
      )
     M.xy .= M_aux.xy
     M.z  .= M_aux.z
 end
 mul!(s::Spinor, M::Mag, Maux_xy, Maux_z) = begin
-    Maux_xy .= 2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy)
-    Maux_z .= (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2*real.(s.α.*s.β.*conj.(M.xy))
+    Maux_xy .= 2 .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy)
+    Maux_z .= (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2 .*real.(s.α.*s.β.*conj.(M.xy))
     M.xy .= Maux_xy
     M.z .= Maux_z
 end
 *(s::Spinor, M::Mag) = begin
     Mag(
-        2*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
-        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2*real.(s.α.*s.β.*conj.(M.xy))
+        2 .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
+        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2 .*real.(s.α.*s.β.*conj.(M.xy))
      )
 end
diff --git a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
index 2b4bfa838..0e1c719d2 100644
--- a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
@@ -63,17 +63,17 @@ function run_spin_precession!(
     #Motion
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
     #Effective field
-    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
+    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
     #Rotation
     if is_ADC_on(seq)
-        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+        ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
     else
-        ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
+        ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
     end
     #Mxy precession and relaxation, and Mz relaxation
     tp   = cumsum(seq.Δt) # t' = t - t0
     dur  = sum(seq.Δt)   # Total length, used for signal relaxation
-    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
+    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))] #This assumes Δw and T2 are constant in time
     M.xy .= Mxy[:, end]
     M.z  .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1))
     #Acquired signal
@@ -111,12 +111,12 @@ function run_spin_excitation!(
         #Motion
         x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
         #Effective field
-        ΔBz = p.Δw ./ T(2π * γ) .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
+        ΔBz = p.Δw ./ T(2π .* γ) .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
         Bz = (s.Gx .* x .+ s.Gy .* y .+ s.Gz .* z) .+ ΔBz
         B = sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
         B[B .== 0] .= eps(T)
         #Spinor Rotation
-        φ = T(-2π * γ) * (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler
+        φ = T(-2π .* γ) .* (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler
         mul!(Q(φ, s.B1 ./ B, Bz ./ B), M)
         #Relaxation
         M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)

From cc92880ca0651f928cac4360998b0fa8f870bcce Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 12:00:03 -0500
Subject: [PATCH 15/24] Also fix incomplete broadcasting for BlochDict

---
 .../SimMethods/BlochDict/BlochDictSimulationMethod.jl     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
index 1507bfa9d..24bb7b9dc 100644
--- a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
@@ -42,17 +42,17 @@ function run_spin_precession!(
     #Motion
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
     #Effective field
-    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw / T(2π * γ)
+    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
     #Rotation
     if is_ADC_on(seq)
-        ϕ = T(-2π * γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+        ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
     else
-        ϕ = T(-2π * γ) .* trapz(seq.Δt', Bz)
+        ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
     end
     #Mxy precession and relaxation, and Mz relaxation
     tp = cumsum(seq.Δt) # t' = t - t0
     dur = sum(seq.Δt)   # Total length, used for signal relaxation
-    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im * sin.(ϕ))] #This assumes Δw and T2 are constant in time
+    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))] #This assumes Δw and T2 are constant in time
     M.xy .= Mxy[:, end]
     #Acquired signal
     sig[:, :, 1] .= transpose(Mxy[:, findall(seq.ADC)])

From dae09d0c4718fa528a43a750ce721af83cfcd570 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 13:35:52 -0500
Subject: [PATCH 16/24] Use cis for complex exponentiation

---
 KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index 35f6e3159..62c518378 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -86,7 +86,7 @@ function run_spin_precession!(
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
-            Mxy .= exp.(-t_seq ./ p.T2) .* (M.xy .* (cos.(ϕ) .+ im .* sin.(ϕ)))
+            Mxy .= exp.(-t_seq ./ p.T2) .* M.xy .* cis.(ϕ)
             sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end
@@ -95,7 +95,7 @@ function run_spin_precession!(
     end
 
     #Final Spin-State
-    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))
+    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* cis.(ϕ)
     M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
 
     return nothing

From b2e3e79a52cc24af817a5995d260753767601766 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 14:33:26 -0500
Subject: [PATCH 17/24] Use @. macro

---
 .../simulation/SimMethods/Bloch/BlochCPU.jl   | 30 +++++++++----------
 .../simulation/SimMethods/Magnetization.jl    |  8 ++---
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index 62c518378..ee9e5b197 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -64,7 +64,7 @@ function run_spin_precession!(
     ϕ = prealloc.Bz_3
     Mxy = prealloc.Mxy_1
     fill!(ϕ, zero(T))
-    Bz_old .= x[:,1] .* seq.Gx[1] .+ y[:,1] .* seq.Gy[1] .+ z[:,1] .* seq.Gz[1] .+ p.Δw ./ T(2π .* γ)
+    @. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
 
     # Fill sig[1] if needed
     ADC_idx = 1
@@ -79,14 +79,14 @@ function run_spin_precession!(
         t_seq += seq.Δt[seq_idx-1]
 
         #Effective Field
-        Bz_new .= x .* seq.Gx[seq_idx] .+ y .* seq.Gy[seq_idx] .+ z .* seq.Gz[seq_idx] .+ p.Δw ./ T(2π .* γ)
+        @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
         
         #Rotation
-        ϕ .= ϕ .+ (Bz_old .+ Bz_new) .* (T(-2π .* γ) .* seq.Δt[seq_idx-1] ./ 2)
+        @. ϕ += (Bz_old + Bz_new) * T(-2π * γ) * seq.Δt[seq_idx-1] / 2
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
-            Mxy .= exp.(-t_seq ./ p.T2) .* M.xy .* cis.(ϕ)
+            @. Mxy = exp(-t_seq / p.T2) * M.xy * cis(ϕ)
             sig[ADC_idx] = sum(Mxy) 
             ADC_idx += 1
         end
@@ -95,8 +95,8 @@ function run_spin_precession!(
     end
 
     #Final Spin-State
-    M.xy .= M.xy .* exp.(-t_seq ./ p.T2) .* cis.(ϕ)
-    M.z .= M.z .* exp.(-t_seq ./ p.T1) .+ p.ρ .* (1 .- exp.(-t_seq ./ p.T1))
+    @. M.xy = M.xy * exp(-t_seq / p.T2) * cis(ϕ)
+    @. M.z = M.z * exp(-t_seq / p.T1) + p.ρ * (1 - exp(-t_seq / p.T1))
 
     return nothing
 end
@@ -126,24 +126,24 @@ function run_spin_excitation!(
     Maux_z = prealloc.Bz_5
 
     #Can be calculated outside of loop
-    ΔBz .= p.Δw ./ T(2π .* γ)
+    @. ΔBz = p.Δw / T(2π * γ)
 
     #Simulation
     for s in seq #This iterates over seq, "s = seq[i,:]"
         #Motion
         x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
         #Effective field
-        Bz .= (s.Gx .* x .+ s.Gy .* y .+ s.Gz .* z) .+ ΔBz .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
-        B .= sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
-        B[B .== 0] .= eps(T)
+        @. Bz = (s.Gx * x + s.Gy * y + s.Gz * z) + ΔBz - s.Δf / T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
+        @. B = sqrt(abs(s.B1)^2 + abs(Bz)^2)
+        @. B[B == 0] = eps(T)
         #Spinor Rotation
-        φ .= T(-2π .* γ) .* (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
-        α .= cos.(φ ./ 2) .- 1im .* (Bz ./ B) .* sin.(φ ./ 2)
-        β .= -1im .* (s.B1 ./ B) .* sin.(φ ./ 2)
+        @. φ = T(-2π * γ) * (B * s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
+        @. α = cos(φ / 2) - 1im * (Bz / B) * sin(φ / 2)
+        @. β = -1im * (s.B1 / B) * sin(φ / 2)
         mul!(Spinor(α, β), M, Maux_xy, Maux_z)
         #Relaxation
-        M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)
-        M.z .= M.z .* exp.(-s.Δt ./ p.T1) .+ p.ρ .* (1 .- exp.(-s.Δt ./ p.T1))
+        @. M.xy = M.xy * exp(-s.Δt / p.T2)
+        @. M.z = M.z * exp(-s.Δt / p.T1) + p.ρ * (1 - exp(-s.Δt / p.T1))
     end
     #Acquired signal
     #sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
diff --git a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
index a5cd4947c..3d09abaab 100644
--- a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
@@ -54,10 +54,10 @@ mul!(s::Spinor, M::Mag) = begin
     M.z  .= M_aux.z
 end
 mul!(s::Spinor, M::Mag, Maux_xy, Maux_z) = begin
-    Maux_xy .= 2 .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy)
-    Maux_z .= (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2 .*real.(s.α.*s.β.*conj.(M.xy))
-    M.xy .= Maux_xy
-    M.z .= Maux_z
+    @. Maux_xy = 2*conj(s.α)*s.β*M.z+conj(s.α)^2*M.xy-s.β^2*conj(M.xy)
+    @. Maux_z = (abs(s.α)^2 -abs(s.β)^2)*M.z-2 *real(s.α*s.β*conj(M.xy))
+    @. M.xy = Maux_xy
+    @. M.z = Maux_z
 end
 *(s::Spinor, M::Mag) = begin
     Mag(

From 3fe77acbbe702b8e9ed92fdd19f8e681ebd85f7d Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 14:49:31 -0500
Subject: [PATCH 18/24] Wrap all scalars with T (slightly helps performance)

---
 .../src/simulation/SimMethods/Bloch/BlochCPU.jl        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index ee9e5b197..7cb822c12 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -82,7 +82,7 @@ function run_spin_precession!(
         @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
         
         #Rotation
-        @. ϕ += (Bz_old + Bz_new) * T(-2π * γ) * seq.Δt[seq_idx-1] / 2
+        @. ϕ += (Bz_old + Bz_new) * T(-2π * γ) * seq.Δt[seq_idx-1] / T(2)
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
@@ -96,7 +96,7 @@ function run_spin_precession!(
 
     #Final Spin-State
     @. M.xy = M.xy * exp(-t_seq / p.T2) * cis(ϕ)
-    @. M.z = M.z * exp(-t_seq / p.T1) + p.ρ * (1 - exp(-t_seq / p.T1))
+    @. M.z = M.z * exp(-t_seq / p.T1) + p.ρ * (T(1) - exp(-t_seq / p.T1))
 
     return nothing
 end
@@ -138,12 +138,12 @@ function run_spin_excitation!(
         @. B[B == 0] = eps(T)
         #Spinor Rotation
         @. φ = T(-2π * γ) * (B * s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
-        @. α = cos(φ / 2) - 1im * (Bz / B) * sin(φ / 2)
-        @. β = -1im * (s.B1 / B) * sin(φ / 2)
+        @. α = cos(φ / T(2)) - Complex{T}(im) * (Bz / B) * sin(φ / T(2))
+        @. β = -Complex{T}(im) * (s.B1 / B) * sin(φ / T(2))
         mul!(Spinor(α, β), M, Maux_xy, Maux_z)
         #Relaxation
         @. M.xy = M.xy * exp(-s.Δt / p.T2)
-        @. M.z = M.z * exp(-s.Δt / p.T1) + p.ρ * (1 - exp(-s.Δt / p.T1))
+        @. M.z = M.z * exp(-s.Δt / p.T1) + p.ρ * (T(1) - exp(-s.Δt / p.T1))
     end
     #Acquired signal
     #sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block

From 7b055b572a90b77185c2e5006a6eda6998f67866 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 16:52:26 -0500
Subject: [PATCH 19/24] Simplify some calculations

---
 KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index 7cb822c12..934ae1d5e 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -82,7 +82,7 @@ function run_spin_precession!(
         @. Bz_new = x * seq.Gx[seq_idx] + y * seq.Gy[seq_idx] + z * seq.Gz[seq_idx] + p.Δw / T(2π * γ)
         
         #Rotation
-        @. ϕ += (Bz_old + Bz_new) * T(-2π * γ) * seq.Δt[seq_idx-1] / T(2)
+        @. ϕ += (Bz_old + Bz_new) * T(-π * γ) * seq.Δt[seq_idx-1]
 
         #Acquired Signal
         if seq_idx <= length(seq.ADC) && seq.ADC[seq_idx]
@@ -137,9 +137,9 @@ function run_spin_excitation!(
         @. B = sqrt(abs(s.B1)^2 + abs(Bz)^2)
         @. B[B == 0] = eps(T)
         #Spinor Rotation
-        @. φ = T(-2π * γ) * (B * s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
-        @. α = cos(φ / T(2)) - Complex{T}(im) * (Bz / B) * sin(φ / T(2))
-        @. β = -Complex{T}(im) * (s.B1 / B) * sin(φ / T(2))
+        @. φ = T(-π * γ) * (B * s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler 
+        @. α = cos(φ) - Complex{T}(im) * (Bz / B) * sin(φ)
+        @. β = -Complex{T}(im) * (s.B1 / B) * sin(φ)
         mul!(Spinor(α, β), M, Maux_xy, Maux_z)
         #Relaxation
         @. M.xy = M.xy * exp(-s.Δt / p.T2)

From d839b5510fa161371a8b29c349afe76d9f7e8c26 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 17:21:30 -0500
Subject: [PATCH 20/24] Also wrap scalars with T in Magnetization.jl

---
 .../src/simulation/SimMethods/Magnetization.jl | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
index 3d09abaab..1ed77bdcf 100644
--- a/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Magnetization.jl
@@ -45,23 +45,23 @@ Parameter relations for the Shinnar-Le Roux selective excitation pulse design al
 (NMR imaging).
 IEEE Transactions on Medical Imaging, 10(1), 53-65. doi:10.1109/42.75611
 """
-mul!(s::Spinor, M::Mag) = begin
+mul!(s::Spinor{T}, M::Mag) where {T<:Real} = begin
     M_aux = Mag(
-        2 .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
-        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2 .*real.(s.α.*s.β.*conj.(M.xy))
+        T(2) .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
+        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-T(2) .*real.(s.α.*s.β.*conj.(M.xy))
      )
     M.xy .= M_aux.xy
     M.z  .= M_aux.z
 end
-mul!(s::Spinor, M::Mag, Maux_xy, Maux_z) = begin
-    @. Maux_xy = 2*conj(s.α)*s.β*M.z+conj(s.α)^2*M.xy-s.β^2*conj(M.xy)
-    @. Maux_z = (abs(s.α)^2 -abs(s.β)^2)*M.z-2 *real(s.α*s.β*conj(M.xy))
+mul!(s::Spinor{T}, M::Mag, Maux_xy, Maux_z) where {T<:Real} = begin
+    @. Maux_xy = T(2)*conj(s.α)*s.β*M.z+conj(s.α)^2*M.xy-s.β^2*conj(M.xy)
+    @. Maux_z = (abs(s.α)^2 -abs(s.β)^2)*M.z-T(2) *real(s.α*s.β*conj(M.xy))
     @. M.xy = Maux_xy
     @. M.z = Maux_z
 end
-*(s::Spinor, M::Mag) = begin
+*(s::Spinor{T}, M::Mag) where {T<:Real} = begin
     Mag(
-        2 .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
-        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-2 .*real.(s.α.*s.β.*conj.(M.xy))
+        T(2) .*conj.(s.α).*s.β.*M.z.+conj.(s.α).^2 .* M.xy.-s.β.^2 .*conj.(M.xy),
+        (abs.(s.α).^2 .-abs.(s.β).^2).*M.z.-T(2) .*real.(s.α.*s.β.*conj.(M.xy))
      )
 end

From e334cfd3c89b527ec5a80804801cd018b7c6ecbc Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 18:41:05 -0500
Subject: [PATCH 21/24] Move default definitions into
 BlochSimpleSimulationMethod.jl

---
 .../simulation/SimMethods/Bloch/BlochCPU.jl   |  10 +-
 .../BlochSimpleSimulationMethod.jl            | 101 ++++++++++++++++++
 .../simulation/SimMethods/SimulationMethod.jl |  96 +----------------
 KomaMRICore/src/simulation/SimulatorCore.jl   |   4 +-
 4 files changed, 111 insertions(+), 100 deletions(-)
 create mode 100644 KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimpleSimulationMethod.jl

diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index 934ae1d5e..b8a8c30f3 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -40,10 +40,10 @@ end
 """
     run_spin_precession(obj, seq, Xt, sig, M, sim_method, backend, prealloc)
 
-Alternate implementation of the run_spin_precession! function in SimulationMethod.jl optimized
-for the CPU. Uses a loop to step through time instead of allocating a matrix of size NSpins x seq.t.
-The Bz_old, Bz_new, ϕ, and Mxy arrays are pre-allocated in run_sim_time_iter! so that they can be
-re-used from block to block.
+Alternate implementation of the run_spin_precession! function in BlochSimpleSimulationMethod.jl 
+optimized for the CPU. Uses a loop to step through time instead of allocating a matrix of size 
+NSpins x seq.t. The Bz_old, Bz_new, ϕ, and Mxy arrays are pre-allocated in run_sim_time_iter! so 
+that they can be re-used from block to block.
 """
 function run_spin_precession!(
     p::Phantom{T},
@@ -104,7 +104,7 @@ end
 """
     run_spin_excitation!(obj, seq, sig, M, sim_method, backend, prealloc)
 
-Alternate implementation of the run_spin_excitation! function in SimulationMethod.jl 
+Alternate implementation of the run_spin_excitation! function in BlochSimpleSimulationMethod.jl 
 optimized for the CPU. Uses preallocation for all arrays to reduce memory usage.
 """
 function run_spin_excitation!(
diff --git a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimpleSimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimpleSimulationMethod.jl
new file mode 100644
index 000000000..bf6b23e4c
--- /dev/null
+++ b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimpleSimulationMethod.jl
@@ -0,0 +1,101 @@
+"""
+Simplest sim method, works for GPU and CPU but not optimized for either. Although Bloch()
+is the simulation method chosen if none is passed, the run_spin_precession! and
+run_spin_excitation! functions in this file are dispatched to at the most abstract level,
+so new simulation methods will start by using these functions.
+"""
+struct BlochSimple <: SimulationMethod end
+
+export BlochSimple
+
+"""
+    run_spin_precession(obj, seq, Xt, sig)
+
+Simulates an MRI sequence `seq` on the Phantom `obj` for time points `t`. It calculates S(t)
+= ∑ᵢ ρ(xᵢ) exp(- t/T2(xᵢ) ) exp(- 𝒊 γ ∫ Bz(xᵢ,t)). It performs the simulation in free
+precession.
+
+# Arguments
+- `obj`: (`::Phantom`) Phantom struct (actually, it's a part of the complete phantom)
+- `seq`: (`::Sequence`) Sequence struct
+
+# Returns
+- `S`: (`Vector{ComplexF64}`) raw signal over time
+- `M0`: (`::Vector{Mag}`) final state of the Mag vector
+"""
+function run_spin_precession!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::SimulationMethod,
+    backend::KA.Backend,
+    prealloc::PreallocResult
+) where {T<:Real}
+    #Simulation
+    #Motion
+    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
+    #Effective field
+    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
+    #Rotation
+    if is_ADC_on(seq)
+        ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
+    else
+        ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
+    end
+    #Mxy precession and relaxation, and Mz relaxation
+    tp   = cumsum(seq.Δt) # t' = t - t0
+    dur  = sum(seq.Δt)   # Total length, used for signal relaxation
+    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))] #This assumes Δw and T2 are constant in time
+    M.xy .= Mxy[:, end]
+    M.z  .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1))
+    #Acquired signal
+    sig .= transpose(sum(Mxy[:, findall(seq.ADC)]; dims=1)) #<--- TODO: add coil sensitivities
+
+    return nothing
+end
+
+"""
+    M0 = run_spin_excitation(obj, seq, M0)
+
+It gives rise to a rotation of `M0` with an angle given by the efective magnetic field
+(including B1, gradients and off resonance) and with respect to a rotation axis.
+
+# Arguments
+- `obj`: (`::Phantom`) Phantom struct (actually, it's a part of the complete phantom)
+- `seq`: (`::Sequence`) Sequence struct
+
+# Returns
+- `M0`: (`::Vector{Mag}`) final state of the Mag vector after a rotation (actually, it's
+    a part of the complete Mag vector and it's a part of the initial state for the next
+    precession simulation step)
+"""
+function run_spin_excitation!(
+    p::Phantom{T},
+    seq::DiscreteSequence{T},
+    sig::AbstractArray{Complex{T}},
+    M::Mag{T},
+    sim_method::SimulationMethod,
+    backend::KA.Backend,
+    prealloc::PreallocResult
+) where {T<:Real}
+    #Simulation
+    for s in seq #This iterates over seq, "s = seq[i,:]"
+        #Motion
+        x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
+        #Effective field
+        ΔBz = p.Δw ./ T(2π .* γ) .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
+        Bz = (s.Gx .* x .+ s.Gy .* y .+ s.Gz .* z) .+ ΔBz
+        B = sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
+        B[B .== 0] .= eps(T)
+        #Spinor Rotation
+        φ = T(-2π .* γ) .* (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler
+        mul!(Q(φ, s.B1 ./ B, Bz ./ B), M)
+        #Relaxation
+        M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)
+        M.z .= M.z .* exp.(-s.Δt ./ p.T1) .+ p.ρ .* (1 .- exp.(-s.Δt ./ p.T1))
+    end
+    #Acquired signal
+    #sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
+    return nothing
+end
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
index 0e1c719d2..ec6586784 100644
--- a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
@@ -4,7 +4,6 @@ struct Bloch <: SimulationMethod end
 export Bloch
 
 include("Magnetization.jl")
-include("KernelFunctions.jl")
 
 @functor Mag #Gives gpu acceleration capabilities, see GPUFunctions.jl
 
@@ -35,94 +34,7 @@ Base.view(p::DefaultPreAlloc, i::UnitRange) = p
 """Default preallocation function."""
 prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}) where {T<:Real} = DefaultPreAlloc{T}()
 
-"""
-    run_spin_precession(obj, seq, Xt, sig)
-
-Simulates an MRI sequence `seq` on the Phantom `obj` for time points `t`. It calculates S(t)
-= ∑ᵢ ρ(xᵢ) exp(- t/T2(xᵢ) ) exp(- 𝒊 γ ∫ Bz(xᵢ,t)). It performs the simulation in free
-precession.
-
-# Arguments
-- `obj`: (`::Phantom`) Phantom struct (actually, it's a part of the complete phantom)
-- `seq`: (`::Sequence`) Sequence struct
-
-# Returns
-- `S`: (`Vector{ComplexF64}`) raw signal over time
-- `M0`: (`::Vector{Mag}`) final state of the Mag vector
-"""
-function run_spin_precession!(
-    p::Phantom{T},
-    seq::DiscreteSequence{T},
-    sig::AbstractArray{Complex{T}},
-    M::Mag{T},
-    sim_method::SimulationMethod,
-    backend::KA.Backend,
-    prealloc::PreallocResult
-) where {T<:Real}
-    #Simulation
-    #Motion
-    x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')
-    #Effective field
-    Bz = x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz' .+ p.Δw ./ T(2π .* γ)
-    #Rotation
-    if is_ADC_on(seq)
-        ϕ = T(-2π .* γ) .* KomaMRIBase.cumtrapz(seq.Δt', Bz, backend)
-    else
-        ϕ = T(-2π .* γ) .* trapz(seq.Δt', Bz)
-    end
-    #Mxy precession and relaxation, and Mz relaxation
-    tp   = cumsum(seq.Δt) # t' = t - t0
-    dur  = sum(seq.Δt)   # Total length, used for signal relaxation
-    Mxy = [M.xy M.xy .* exp.(-tp' ./ p.T2) .* (cos.(ϕ) .+ im .* sin.(ϕ))] #This assumes Δw and T2 are constant in time
-    M.xy .= Mxy[:, end]
-    M.z  .= M.z .* exp.(-dur ./ p.T1) .+ p.ρ .* (1 .- exp.(-dur ./ p.T1))
-    #Acquired signal
-    sig .= transpose(sum(Mxy[:, findall(seq.ADC)]; dims=1)) #<--- TODO: add coil sensitivities
-
-    return nothing
-end
-
-"""
-    M0 = run_spin_excitation(obj, seq, M0)
-
-It gives rise to a rotation of `M0` with an angle given by the efective magnetic field
-(including B1, gradients and off resonance) and with respect to a rotation axis.
-
-# Arguments
-- `obj`: (`::Phantom`) Phantom struct (actually, it's a part of the complete phantom)
-- `seq`: (`::Sequence`) Sequence struct
-
-# Returns
-- `M0`: (`::Vector{Mag}`) final state of the Mag vector after a rotation (actually, it's
-    a part of the complete Mag vector and it's a part of the initial state for the next
-    precession simulation step)
-"""
-function run_spin_excitation!(
-    p::Phantom{T},
-    seq::DiscreteSequence{T},
-    sig::AbstractArray{Complex{T}},
-    M::Mag{T},
-    sim_method::SimulationMethod,
-    backend::KA.Backend,
-    prealloc::PreallocResult
-) where {T<:Real}
-    #Simulation
-    for s in seq #This iterates over seq, "s = seq[i,:]"
-        #Motion
-        x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, s.t)
-        #Effective field
-        ΔBz = p.Δw ./ T(2π .* γ) .- s.Δf ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
-        Bz = (s.Gx .* x .+ s.Gy .* y .+ s.Gz .* z) .+ ΔBz
-        B = sqrt.(abs.(s.B1) .^ 2 .+ abs.(Bz) .^ 2)
-        B[B .== 0] .= eps(T)
-        #Spinor Rotation
-        φ = T(-2π .* γ) .* (B .* s.Δt) # TODO: Use trapezoidal integration here (?),  this is just Forward Euler
-        mul!(Q(φ, s.B1 ./ B, Bz ./ B), M)
-        #Relaxation
-        M.xy .= M.xy .* exp.(-s.Δt ./ p.T2)
-        M.z .= M.z .* exp.(-s.Δt ./ p.T1) .+ p.ρ .* (1 .- exp.(-s.Δt ./ p.T1))
-    end
-    #Acquired signal
-    #sig .= -1.4im #<-- This was to test if an ADC point was inside an RF block
-    return nothing
-end
\ No newline at end of file
+include("KernelFunctions.jl")
+include("BlochSimple/BlochSimpleSimulationMethod.jl")
+include("Bloch/BlochCPU.jl")
+include("BlochDict/BlochDictSimulationMethod.jl")
\ No newline at end of file
diff --git a/KomaMRICore/src/simulation/SimulatorCore.jl b/KomaMRICore/src/simulation/SimulatorCore.jl
index bb7bd71bc..8a6e9eca3 100644
--- a/KomaMRICore/src/simulation/SimulatorCore.jl
+++ b/KomaMRICore/src/simulation/SimulatorCore.jl
@@ -2,9 +2,7 @@ abstract type SimulationMethod end #get all available types by using subtypes(Ko
 abstract type SpinStateRepresentation{T<:Real} end #get all available types by using subtypes(KomaMRI.SpinStateRepresentation)
 
 #Defined methods:
-include("SimMethods/SimulationMethod.jl")  #Defines Bloch simulation method
-include("SimMethods/BlochDict/BlochDictSimulationMethod.jl")   #Defines BlochDict simulation method
-include("SimMethods/Bloch/BlochCPU.jl") #Defines specialized Bloch functions for CPU
+include("SimMethods/SimulationMethod.jl")  #Defines simulation methods
 
 """
     sim_params = default_sim_params(sim_params=Dict{String,Any}())

From 4801e48916e1b0a5769f196c492015152da115af Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Thu, 18 Jul 2024 19:22:07 -0500
Subject: [PATCH 22/24] Refactor allocation structs

---
 KomaMRICore/src/datatypes/Spinor.jl           |  1 +
 .../simulation/SimMethods/Bloch/BlochCPU.jl   | 66 +++++++++----------
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/KomaMRICore/src/datatypes/Spinor.jl b/KomaMRICore/src/datatypes/Spinor.jl
index 803578081..8d8977bbb 100644
--- a/KomaMRICore/src/datatypes/Spinor.jl
+++ b/KomaMRICore/src/datatypes/Spinor.jl
@@ -31,6 +31,7 @@ Spinor(α::Complex{T}, β::Complex{T}) where {T<:Real} = Spinor([α], [β])
 Spinor(α::T, β::T) where {T<:Real} = Spinor([complex(α)], [complex(β)])
 one(T::Spinor) = Spinor(1.,0.)
 Base.getindex(s::Spinor, i) = Spinor(s.α[i], s.β[i])
+Base.view(s::Spinor, i::UnitRange) = @views Spinor(s.α[i], s.β[i])
 """
     str = show(io::IO, s::Spinor)
 
diff --git a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
index b8a8c30f3..de59c88ba 100644
--- a/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
+++ b/KomaMRICore/src/simulation/SimMethods/Bloch/BlochCPU.jl
@@ -1,39 +1,39 @@
-"""Stores arrays for use in Bloch CPU run_spin_precession function."""
+"""Stores preallocated structs for use in Bloch CPU run_spin_precession function."""
 struct BlochCPUPrealloc{T} <: PreallocResult{T}
-    Bz_1::AbstractVector{T}               # Vector{T}(Nspins x 1)
-    Bz_2::AbstractVector{T}               # Vector{T}(Nspins x 1)
-    Bz_3::AbstractVector{T}               # Vector{T}(Nspins x 1)
-    Bz_4::AbstractVector{T}               # Vector{T}(Nspins x 1)
-    Bz_5::AbstractVector{T}               # Vector{T}(Nspins x 1)
-    Mxy_1::AbstractVector{Complex{T}}     # Vector{Complex{T}}(Nspins x 1)
-    Mxy_2::AbstractVector{Complex{T}}     # Vector{Complex{T}}(Nspins x 1)
-    Mxy_3::AbstractVector{Complex{T}}     # Vector{Complex{T}}(Nspins x 1)
+    M::Mag{T}                               # Mag{T}
+    Bz_old::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    Bz_new::AbstractVector{T}               # Vector{T}(Nspins x 1)
+    ϕ::AbstractVector{T}                    # Vector{T}(Nspins x 1)
+    φ::AbstractVector{T}                    # Vector{T}(Nspins x 1)
+    Rot::Spinor{T}                          # Spinor{T}
 end
 
 Base.view(p::BlochCPUPrealloc, i::UnitRange) = begin
     @views BlochCPUPrealloc(
-        p.Bz_1[i],
-        p.Bz_2[i],
-        p.Bz_3[i],
-        p.Bz_4[i],
-        p.Bz_5[i],
-        p.Mxy_1[i],
-        p.Mxy_2[i],
-        p.Mxy_3[i]
+        p.M[i],
+        p.Bz_old[i],
+        p.Bz_new[i],
+        p.ϕ[i],
+        p.φ[i],
+        p.Rot[i]
     )
 end
 
 """Preallocates arrays for use in run_spin_precession."""
 function prealloc(sim_method::Bloch, backend::KA.CPU, obj::Phantom{T}, M::Mag{T}) where {T<:Real}
     return BlochCPUPrealloc(
+        Mag(
+            similar(M.xy),
+            similar(M.z)
+        ),
         similar(obj.x),
         similar(obj.x),
         similar(obj.x),
         similar(obj.x),
-        similar(obj.x),
-        similar(M.xy),
-        similar(M.xy),
-        similar(M.xy)
+        Spinor(
+            similar(M.xy),
+            similar(M.xy)
+        )
     )
 end
 
@@ -59,10 +59,10 @@ function run_spin_precession!(
     x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t[1,:]')
     
     #Initialize arrays
-    Bz_old = prealloc.Bz_1
-    Bz_new = prealloc.Bz_2
-    ϕ = prealloc.Bz_3
-    Mxy = prealloc.Mxy_1
+    Bz_old = prealloc.Bz_old
+    Bz_new = prealloc.Bz_new
+    ϕ = prealloc.ϕ
+    Mxy = prealloc.M.xy
     fill!(ϕ, zero(T))
     @. Bz_old = x[:,1] * seq.Gx[1] + y[:,1] * seq.Gy[1] + z[:,1] * seq.Gz[1] + p.Δw / T(2π * γ)
 
@@ -116,14 +116,14 @@ function run_spin_excitation!(
     backend::KA.CPU,
     prealloc::BlochCPUPrealloc
 ) where {T<:Real}
-    ΔBz = prealloc.Bz_1
-    Bz = prealloc.Bz_2
-    B = prealloc.Bz_3
-    φ = prealloc.Bz_4
-    α = prealloc.Mxy_1
-    β = prealloc.Mxy_2
-    Maux_xy = prealloc.Mxy_3
-    Maux_z = prealloc.Bz_5
+    ΔBz = prealloc.Bz_old
+    Bz = prealloc.Bz_new
+    B = prealloc.ϕ
+    φ = prealloc.φ
+    α = prealloc.Rot.α
+    β = prealloc.Rot.β
+    Maux_xy = prealloc.M.xy
+    Maux_z = prealloc.M.z
 
     #Can be calculated outside of loop
     @. ΔBz = p.Δw / T(2π * γ)

From b86da0bef7718dc91f2fe6e49f04500824d6a2da Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Fri, 19 Jul 2024 13:58:10 -0500
Subject: [PATCH 23/24] Rename files

---
 .../BlochDict/{BlochDictSimulationMethod.jl => BlochDict.jl}  | 0
 .../{BlochSimpleSimulationMethod.jl => BlochSimple.jl}        | 0
 KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl     | 4 ++--
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename KomaMRICore/src/simulation/SimMethods/BlochDict/{BlochDictSimulationMethod.jl => BlochDict.jl} (100%)
 rename KomaMRICore/src/simulation/SimMethods/BlochSimple/{BlochSimpleSimulationMethod.jl => BlochSimple.jl} (100%)

diff --git a/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl
similarity index 100%
rename from KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDictSimulationMethod.jl
rename to KomaMRICore/src/simulation/SimMethods/BlochDict/BlochDict.jl
diff --git a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimpleSimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
similarity index 100%
rename from KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimpleSimulationMethod.jl
rename to KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
diff --git a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
index ec6586784..0fbc236bc 100644
--- a/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
+++ b/KomaMRICore/src/simulation/SimMethods/SimulationMethod.jl
@@ -35,6 +35,6 @@ Base.view(p::DefaultPreAlloc, i::UnitRange) = p
 prealloc(sim_method::SimulationMethod, backend::KA.Backend, obj::Phantom{T}, M::Mag{T}) where {T<:Real} = DefaultPreAlloc{T}()
 
 include("KernelFunctions.jl")
-include("BlochSimple/BlochSimpleSimulationMethod.jl")
+include("BlochSimple/BlochSimple.jl")
 include("Bloch/BlochCPU.jl")
-include("BlochDict/BlochDictSimulationMethod.jl")
\ No newline at end of file
+include("BlochDict/BlochDict.jl")
\ No newline at end of file

From 34a3a034b6a62d5a8491b014a11ab962f0c25945 Mon Sep 17 00:00:00 2001
From: Ryan Kierulf <ryanakierulf@gmail.com>
Date: Fri, 19 Jul 2024 14:28:21 -0500
Subject: [PATCH 24/24] Update so documentation doesn't fail

---
 .../simulation/SimMethods/BlochSimple/BlochSimple.jl   | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
index bf6b23e4c..3b662cb74 100644
--- a/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
+++ b/KomaMRICore/src/simulation/SimMethods/BlochSimple/BlochSimple.jl
@@ -1,9 +1,7 @@
-"""
-Simplest sim method, works for GPU and CPU but not optimized for either. Although Bloch()
-is the simulation method chosen if none is passed, the run_spin_precession! and
-run_spin_excitation! functions in this file are dispatched to at the most abstract level,
-so new simulation methods will start by using these functions.
-"""
+#Simplest sim method, works for GPU and CPU but not optimized for either. Although Bloch()
+#is the simulation method chosen if none is passed, the run_spin_precession! and
+#run_spin_excitation! functions in this file are dispatched to at the most abstract level,
+#so new simulation methods will start by using these functions.
 struct BlochSimple <: SimulationMethod end
 
 export BlochSimple