Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/gpu-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ jobs:
name: Run Benchmarks
needs: start-gpu-runner
runs-on: ${{ needs.start-gpu-runner.outputs.label }}
permissions:
contents: write
pull-requests: write
env:
JULIA_CUDA_MEMORY_POOL: none
timeout-minutes: 120
Expand All @@ -79,6 +82,24 @@ jobs:
with:
name: benchmark-results-${{ inputs.instance-type }}-${{ github.run_number }}
path: benchmark/results/
# Publish to /bench on gh-pages (time-series chart + regression alerts).
# save-data-file and auto-push are gated to pushes on main, so PR / branch runs
# never write to the published series — they are still compared against it.
- name: Publish benchmark to gh-pages
if: success() && hashFiles('benchmark/results/bench.json') != ''
uses: benchmark-action/github-action-benchmark@v1
with:
name: CuQuantum.jl GPU benchmarks (${{ inputs.instance-type }})
tool: customSmallerIsBetter
output-file-path: benchmark/results/bench.json
github-token: ${{ secrets.GITHUB_TOKEN }}
gh-pages-branch: gh-pages
benchmark-data-dir-path: bench
alert-threshold: '120%'
comment-on-alert: true
fail-on-alert: false
save-data-file: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
auto-push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}

stop-gpu-runner:
name: Stop GPU Runner
Expand Down
1 change: 1 addition & 0 deletions benchmark/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CuQuantum = "fc340f4a-ac26-4dfb-b247-6d4fbb695c9a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
QuantumToolbox = "6c2fb7c5-b903-41d2-bc5e-5a7c320b9fab"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

Expand Down
189 changes: 125 additions & 64 deletions benchmark/run_benchmarks.jl
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
# =============================================================================
# Benchmark: cuDensityMat (GPU) vs QuantumToolbox.jl (CPU)
# Benchmark: Liouvillian action L[ρ] — cuDensityMat vs QuantumToolbox.jl
# =============================================================================
#
# Compares the time to evaluate a single Liouvillian action L[ρ] for the
# dual-rail cavity system at varying numbers of cavities M.
# Times a single L[ρ] evaluation for the dual-rail cavity system across:
# - cuDensityMat (GPU, direct library binding)
# - QT.jl GPU cuSPARSE (sparse Liouvillian on-device via QuantumObject)
# - QT.jl CPU sparse
# - CPU dense (small sizes only)
#
# System: M coupled cavities, Fock truncation d=3
# H(t) = Σ_m χ n_m(n_m-1) + Σ_m δ_m(t) n_m + Σ_{n≠m} κ(t) a_n†a_m
# H = Σ_m χ n_m(n_m-1) + Σ_{n≠m} κ a_n†a_m
# L[ρ] = -i[H,ρ] + γ Σ_m (a_m ρ a_m† - ½{n_m, ρ})
#
# Usage:
# julia --project=. benchmark/run_benchmarks.jl
#
# Output: benchmark_results.csv
# Outputs (written to benchmark/results/):
# benchmark_results.csv — raw per-backend median/min/max
# bench.json — customSmallerIsBetter format for github-action-benchmark

using CUDA
using CuQuantum
using CuQuantum.CuDensityMat
using LinearAlgebra
using SparseArrays
using QuantumToolbox
using Statistics

# =============================================================================
Expand Down Expand Up @@ -341,92 +346,93 @@ function benchmark_cpu_dense(M::Int, d::Int; n_warmup = 3, n_trials = 20)
end

# =============================================================================
# 3. CPU sparse baseline: sparse superoperator
# 3. QuantumToolbox.jl path: sparse Liouvillian via QT.jl, CPU + GPU cuSPARSE
# =============================================================================
#
# Routed through QuantumToolbox.jl's `liouvillian` + `QuantumObject` so the
# tracked series includes QT.jl dispatch/allocation overhead — regressions in
# the user-facing QT.jl path show up here.

"""
    build_qt_system(M::Int, d::Int)

Build the Lindblad system for `M` coupled cavities with Fock truncation `d` as
QuantumToolbox.jl `QuantumObject`s.

The Hamiltonian is `Σ_m χ n_m (n_m - 1) + Σ_{n≠m} κ₀ a_n† a_m`, and every cavity
contributes one collapse operator `√γ a_m`.

# Returns
- `L`: the Liouvillian super-operator produced by `liouvillian(H, c_ops)`.
- `ρ_vec`: the vectorized initial density matrix `|1,0,…,0⟩⟨1,0,…,0|`.
"""
function build_qt_system(M::Int, d::Int)
    a = destroy(d)
    Id = qeye(d)

    # Embed the single-cavity annihilation operator at each site m.
    a_ops = [tensor([i == m ? a : Id for i in 1:M]...) for m in 1:M]
    n_ops = [adjoint(ao) * ao for ao in a_ops]

    χ = 2π * 0.2
    κ₀ = 2π * 0.1
    γ = 0.01

    # Kerr term on every cavity, then hopping between every ordered pair n ≠ m.
    Id_full = tensor(fill(Id, M)...)
    H = sum(χ * nm * (nm - Id_full) for nm in n_ops)
    for n in 1:M, m in 1:M
        n == m && continue
        H += κ₀ * (adjoint(a_ops[n]) * a_ops[m])
    end

    c_ops = [sqrt(γ) * ao for ao in a_ops]
    L = liouvillian(H, c_ops)

    # Initial state: |1,0,...,0⟩⟨1,0,...,0|
    ψ = tensor(fock(d, 1), [fock(d, 0) for _ in 2:M]...)
    ρ_vec = mat2vec(ket2dm(ψ))

    return L, ρ_vec
end

"""
    benchmark_cpu_sparse(M::Int, d::Int; n_warmup = 3, n_trials = 20)

Time `L * ρ_vec` for the QuantumToolbox.jl system built by `build_qt_system(M, d)`.

Performs `n_warmup` untimed applications followed by `n_trials` timed ones.

# Returns
`(median, minimum, maximum)` of the per-trial wall-clock times in milliseconds,
or `(NaN, NaN, NaN)` when `d^M > 1000` and the size is skipped.
"""
function benchmark_cpu_sparse(M::Int, d::Int; n_warmup = 3, n_trials = 20)
    D = d^M
    # Sparse superoperator is D²×D² with O(M·d²·D²) non-zeros.
    # D=729 (M=6): ~50M nnz ≈ 1.2 GB — feasible. D=6561 (M=8): ~5B nnz — too large.
    if D > 1000
        return NaN, NaN, NaN
    end

    # Build the system exactly once; only the QuantumToolbox.jl objects are timed.
    L, ρ_vec = build_qt_system(M, d)

    # Warmup (QObject `*` returns a new QObject — captures allocation overhead)
    for _ in 1:n_warmup
        L * ρ_vec
    end

    times = Float64[]
    for _ in 1:n_trials
        t0 = time_ns()
        L * ρ_vec
        t1 = time_ns()
        push!(times, (t1 - t0) / 1e6)  # ns → ms
    end

    return median(times), minimum(times), maximum(times)
end

function benchmark_cusparse_gpu(M::Int, d::Int; n_warmup = 5, n_trials = 50)
D = d^M
if D > 1000
return NaN, NaN, NaN
end

L_cpu, ρ_vec_cpu = build_qt_system(M, d)
L_gpu = cu(L_cpu) # QObject wrapping CuSparseMatrixCSC
ρ_vec_gpu = cu(ρ_vec_cpu) # QObject wrapping CuVector

# Warmup
for _ = 1:n_warmup
mul!(rho_dot, L_sparse, rho_vec)
L_gpu * ρ_vec_gpu
end
CUDA.synchronize()

# Timed runs
times = Float64[]
for _ = 1:n_trials
CUDA.synchronize()
t0 = time_ns()
mul!(rho_dot, L_sparse, rho_vec)
L_gpu * ρ_vec_gpu
CUDA.synchronize()
t1 = time_ns()
push!(times, (t1 - t0) / 1e6)
end
Expand Down Expand Up @@ -526,12 +532,37 @@ function main()
println("SKIPPED (D=$D too large)")
end

# GPU cuSPARSE SpMV
print(" GPU cuSPARSE SpMV: ")
gpu_cusparse_med, gpu_cusparse_min, gpu_cusparse_max = try
benchmark_cusparse_gpu(M, d)
catch e
println("FAILED: $e")
(NaN, NaN, NaN)
end
if !isnan(gpu_cusparse_med)
println(
"$(round(gpu_cusparse_med, digits=3)) ms (min=$(round(gpu_cusparse_min, digits=3)), max=$(round(gpu_cusparse_max, digits=3)))",
)
elseif gpu_cusparse_med === NaN
println("SKIPPED (D=$D too large)")
end

# Speedup
if !isnan(gpu_med) && !isnan(cpu_sparse_med)
println(" Speedup (sparse/GPU): $(round(cpu_sparse_med / gpu_med, digits=1))x")
println(
" Speedup (sparse/GPU cuDensityMat): $(round(cpu_sparse_med / gpu_med, digits=1))x",
)
end
if !isnan(gpu_med) && !isnan(cpu_dense_med)
println(" Speedup (dense/GPU): $(round(cpu_dense_med / gpu_med, digits=1))x")
println(
" Speedup (dense/GPU cuDensityMat): $(round(cpu_dense_med / gpu_med, digits=1))x",
)
end
if !isnan(gpu_cusparse_med) && !isnan(gpu_med)
println(
" Speedup (cuSPARSE/cuDensityMat): $(round(gpu_cusparse_med / gpu_med, digits=1))x",
)
end

push!(
Expand All @@ -542,6 +573,7 @@ function main()
gpu_ms = gpu_med,
cpu_dense_ms = cpu_dense_med,
cpu_sparse_ms = cpu_sparse_med,
gpu_cusparse_ms = gpu_cusparse_med,
),
)
println()
Expand All @@ -552,15 +584,44 @@ function main()
mkpath(results_dir)
csv_file = joinpath(results_dir, "benchmark_results.csv")
open(csv_file, "w") do io
println(io, "M,D,rho_elements,gpu_ms,cpu_dense_ms,cpu_sparse_ms")
println(io, "M,D,rho_elements,gpu_ms,cpu_dense_ms,cpu_sparse_ms,gpu_cusparse_ms")
for r in results
println(
io,
"$(r.M),$(r.D),$(r.D^2),$(r.gpu_ms),$(r.cpu_dense_ms),$(r.cpu_sparse_ms)",
"$(r.M),$(r.D),$(r.D^2),$(r.gpu_ms),$(r.cpu_dense_ms),$(r.cpu_sparse_ms),$(r.gpu_cusparse_ms)",
)
end
end
println("Results saved to $csv_file")

# Emit customSmallerIsBetter JSON for github-action-benchmark
json_file = joinpath(results_dir, "bench.json")
open(json_file, "w") do io
print(io, "[")
first = true
for r in results
for (label, value) in (
("cuDensityMat GPU L[ρ] M=$(r.M) D=$(r.D)", r.gpu_ms),
("QT.jl GPU cuSPARSE L[ρ] M=$(r.M) D=$(r.D)", r.gpu_cusparse_ms),
("CPU dense SpMV L[ρ] M=$(r.M) D=$(r.D)", r.cpu_dense_ms),
("QT.jl CPU sparse L[ρ] M=$(r.M) D=$(r.D)", r.cpu_sparse_ms),
)
isnan(value) && continue
first || print(io, ",")
first = false
print(
io,
"{\"name\":\"",
label,
"\",\"unit\":\"ms\",\"value\":",
value,
"}",
)
end
end
print(io, "]")
end
println("JSON results saved to $json_file")
end

main()
2 changes: 2 additions & 0 deletions docs/src/benchmarks.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Benchmarks

**[Live time-series dashboard →](https://harmoniqs.github.io/CuQuantum.jl/bench/)** — per-commit GPU benchmark history, populated on every push to `main`.

All benchmarks use the same Lindblad master equation for ``M`` coupled cavities with Fock truncation ``d=3``:

```math
Expand Down
Loading