diff --git a/.github/workflows/gpu-benchmark.yml b/.github/workflows/gpu-benchmark.yml index b940756..ae903d2 100644 --- a/.github/workflows/gpu-benchmark.yml +++ b/.github/workflows/gpu-benchmark.yml @@ -59,6 +59,9 @@ jobs: name: Run Benchmarks needs: start-gpu-runner runs-on: ${{ needs.start-gpu-runner.outputs.label }} + permissions: + contents: write + pull-requests: write env: JULIA_CUDA_MEMORY_POOL: none timeout-minutes: 120 @@ -79,6 +82,24 @@ jobs: with: name: benchmark-results-${{ inputs.instance-type }}-${{ github.run_number }} path: benchmark/results/ + # Publish to /bench on gh-pages (time-series chart + regression alerts). + # save-data-file and auto-push are gated to main so PR / branch runs never + # pollute the published series — they still render a comparison. + - name: Publish benchmark to gh-pages + if: success() && hashFiles('benchmark/results/bench.json') != '' + uses: benchmark-action/github-action-benchmark@v1 + with: + name: CuQuantum.jl GPU benchmarks (${{ inputs.instance-type }}) + tool: customSmallerIsBetter + output-file-path: benchmark/results/bench.json + github-token: ${{ secrets.GITHUB_TOKEN }} + gh-pages-branch: gh-pages + benchmark-data-dir-path: bench + alert-threshold: '120%' + comment-on-alert: true + fail-on-alert: false + save-data-file: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + auto-push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} stop-gpu-runner: name: Stop GPU Runner diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 0be70c6..a07827b 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -2,6 +2,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CuQuantum = "fc340f4a-ac26-4dfb-b247-6d4fbb695c9a" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +QuantumToolbox = "6c2fb7c5-b903-41d2-bc5e-5a7c320b9fab" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git 
a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl index ee77da4..ced7827 100644 --- a/benchmark/run_benchmarks.jl +++ b/benchmark/run_benchmarks.jl @@ -1,24 +1,29 @@ # ============================================================================= -# Benchmark: cuDensityMat (GPU) vs QuantumToolbox.jl (CPU) +# Benchmark: Liouvillian action L[ρ] — cuDensityMat vs QuantumToolbox.jl # ============================================================================= # -# Compares the time to evaluate a single Liouvillian action L[ρ] for the -# dual-rail cavity system at varying numbers of cavities M. +# Times a single L[ρ] evaluation for the dual-rail cavity system across: +# - cuDensityMat (GPU, direct library binding) +# - QT.jl GPU cuSPARSE (sparse Liouvillian on-device via QuantumObject) +# - QT.jl CPU sparse +# - CPU dense (small sizes only) # # System: M coupled cavities, Fock truncation d=3 -# H(t) = Σ_m χ n_m(n_m-1) + Σ_m δ_m(t) n_m + Σ_{n≠m} κ(t) a_n†a_m +# H = Σ_m χ n_m(n_m-1) + Σ_{n≠m} κ a_n†a_m # L[ρ] = -i[H,ρ] + γ Σ_m (a_m ρ a_m† - ½{n_m, ρ}) # # Usage: # julia --project=. benchmark/run_benchmarks.jl # -# Output: benchmark_results.csv +# Outputs (written to benchmark/results/): +# benchmark_results.csv — raw per-backend median/min/max +# bench.json — customSmallerIsBetter format for github-action-benchmark using CUDA using CuQuantum using CuQuantum.CuDensityMat using LinearAlgebra -using SparseArrays +using QuantumToolbox using Statistics # ============================================================================= @@ -341,92 +346,93 @@ function benchmark_cpu_dense(M::Int, d::Int; n_warmup = 3, n_trials = 20) end # ============================================================================= -# 3. CPU sparse baseline: sparse superoperator +# 3. 
QuantumToolbox.jl path: sparse Liouvillian via QT.jl, CPU + GPU cuSPARSE # ============================================================================= +# +# Routed through QuantumToolbox.jl's `liouvillian` + `QuantumObject` so the +# tracked series includes QT.jl dispatch/allocation overhead — regressions in +# the user-facing QT.jl path show up here. """ -Build a sparse Liouvillian superoperator directly using Kronecker products -of sparse matrices. Avoids forming the dense superoperator. +Build the Lindblad system as QuantumToolbox.jl `QuantumObject`s. Returns the +Liouvillian super-operator and the vectorized initial density matrix. """ -function build_cpu_sparse_system(M::Int, d::Int) - T = ComplexF64 - D = d^M +function build_qt_system(M::Int, d::Int) + a = destroy(d) + Id = qeye(d) - # Single-cavity operators (sparse) - a_single = spzeros(T, d, d) - for n = 1:(d-1) - a_single[n, n+1] = sqrt(n) - end - n_single = a_single' * a_single - kerr_single = n_single * (n_single - sparse(T(1)*I, d, d)) - - # Embed into full Hilbert space using sparse Kronecker products - function embed_sparse(op, mode, M, d) - mats = [i == mode ? op : sparse(T(1)*I, d, d) for i = 1:M] - result = mats[1] - for i = 2:M - result = kron(result, mats[i]) - end - return result - end + a_ops = [tensor([i == m ? a : Id for i = 1:M]...) for m = 1:M] + n_ops = [adjoint(ao) * ao for ao in a_ops] χ = 2π * 0.2 κ₀ = 2π * 0.1 γ = 0.01 - eye_D = sparse(T(1)*I, D, D) - - # Build Hamiltonian (sparse D×D) - H = spzeros(T, D, D) - a_ops = [embed_sparse(a_single, m, M, d) for m = 1:M] - n_ops = [embed_sparse(n_single, m, M, d) for m = 1:M] - for m = 1:M - H += χ * embed_sparse(kerr_single, m, M, d) - end + Id_full = tensor(fill(Id, M)...) 
+ H = sum(χ * nm * (nm - Id_full) for nm in n_ops) for n = 1:M, m = 1:M n == m && continue - H += κ₀ * (a_ops[n]' * a_ops[m]) + H += κ₀ * (adjoint(a_ops[n]) * a_ops[m]) end - # Build sparse Liouvillian superoperator (D²×D²) - # L = -i(I⊗H - Hᵀ⊗I) + γ Σ_m (conj(a_m)⊗a_m - ½ I⊗n_m - ½ n_mᵀ⊗I) - L_super = -im * (kron(eye_D, H) - kron(transpose(H), eye_D)) - for m = 1:M - am = a_ops[m] - nm = n_ops[m] - L_super += - γ * (kron(conj(am), am) - 0.5*kron(eye_D, nm) - 0.5*kron(transpose(nm), eye_D)) - end + c_ops = [sqrt(γ) * ao for ao in a_ops] + L = liouvillian(H, c_ops) - # Initial state - rho_vec = zeros(T, D*D) - rho_vec[2+D*(2-1)] = 1.0 + # Initial state: |1,0,...,0⟩⟨1,0,...,0| + ψ = tensor(fock(d, 1), [fock(d, 0) for _ = 2:M]...) + ρ_vec = mat2vec(ket2dm(ψ)) - return L_super, rho_vec + return L, ρ_vec end function benchmark_cpu_sparse(M::Int, d::Int; n_warmup = 3, n_trials = 20) D = d^M - # Sparse superoperator is D²×D² but with O(M * d² * D²) non-zeros. - # For D=729 (M=6): ~531K² matrix, ~50M non-zeros → ~1.2 GB storage. Feasible. - # For D=6561 (M=8): ~43M² matrix, ~5B non-zeros → ~120 GB storage. Too large. + # Sparse superoperator is D²×D² with O(M·d²·D²) non-zeros. + # D=729 (M=6): ~50M nnz ≈ 1.2 GB — feasible. D=6561 (M=8): ~5B nnz — too large. 
if D > 1000 return NaN, NaN, NaN end - L_sparse, rho_vec = build_cpu_sparse_system(M, d) - rho_dot = similar(rho_vec) + L, ρ_vec = build_qt_system(M, d) + + # Warmup (QObject `*` returns a new QObject — captures allocation overhead) + for _ = 1:n_warmup + L * ρ_vec + end + + times = Float64[] + for _ = 1:n_trials + t0 = time_ns() + L * ρ_vec + t1 = time_ns() + push!(times, (t1 - t0) / 1e6) + end + + return median(times), minimum(times), maximum(times) +end + +function benchmark_cusparse_gpu(M::Int, d::Int; n_warmup = 5, n_trials = 50) + D = d^M + if D > 1000 + return NaN, NaN, NaN + end + + L_cpu, ρ_vec_cpu = build_qt_system(M, d) + L_gpu = cu(L_cpu) # QObject wrapping CuSparseMatrixCSC + ρ_vec_gpu = cu(ρ_vec_cpu) # QObject wrapping CuVector # Warmup for _ = 1:n_warmup - mul!(rho_dot, L_sparse, rho_vec) + L_gpu * ρ_vec_gpu end + CUDA.synchronize() - # Timed runs times = Float64[] for _ = 1:n_trials + CUDA.synchronize() t0 = time_ns() - mul!(rho_dot, L_sparse, rho_vec) + L_gpu * ρ_vec_gpu + CUDA.synchronize() t1 = time_ns() push!(times, (t1 - t0) / 1e6) end @@ -526,12 +532,37 @@ function main() println("SKIPPED (D=$D too large)") end + # GPU cuSPARSE SpMV + print(" GPU cuSPARSE SpMV: ") + gpu_cusparse_med, gpu_cusparse_min, gpu_cusparse_max = try + benchmark_cusparse_gpu(M, d) + catch e + println("FAILED: $e") + (NaN, NaN, NaN) + end + if !isnan(gpu_cusparse_med) + println( + "$(round(gpu_cusparse_med, digits=3)) ms (min=$(round(gpu_cusparse_min, digits=3)), max=$(round(gpu_cusparse_max, digits=3)))", + ) + elseif D > 1000 # size guard in benchmark_cusparse_gpu tripped; a failure was already reported by the catch above + println("SKIPPED (D=$D too large)") + end + # Speedup if !isnan(gpu_med) && !isnan(cpu_sparse_med) - println(" Speedup (sparse/GPU): $(round(cpu_sparse_med / gpu_med, digits=1))x") + println( + " Speedup (sparse/GPU cuDensityMat): $(round(cpu_sparse_med / gpu_med, digits=1))x", + ) end if !isnan(gpu_med) && !isnan(cpu_dense_med) - println(" Speedup (dense/GPU): $(round(cpu_dense_med / gpu_med, digits=1))x") + println( + " Speedup
(dense/GPU cuDensityMat): $(round(cpu_dense_med / gpu_med, digits=1))x", + ) + end + if !isnan(gpu_cusparse_med) && !isnan(gpu_med) + println( + " Speedup (cuSPARSE/cuDensityMat): $(round(gpu_cusparse_med / gpu_med, digits=1))x", + ) end push!( @@ -542,6 +573,7 @@ function main() gpu_ms = gpu_med, cpu_dense_ms = cpu_dense_med, cpu_sparse_ms = cpu_sparse_med, + gpu_cusparse_ms = gpu_cusparse_med, ), ) println() @@ -552,15 +584,44 @@ function main() mkpath(results_dir) csv_file = joinpath(results_dir, "benchmark_results.csv") open(csv_file, "w") do io - println(io, "M,D,rho_elements,gpu_ms,cpu_dense_ms,cpu_sparse_ms") + println(io, "M,D,rho_elements,gpu_ms,cpu_dense_ms,cpu_sparse_ms,gpu_cusparse_ms") for r in results println( io, - "$(r.M),$(r.D),$(r.D^2),$(r.gpu_ms),$(r.cpu_dense_ms),$(r.cpu_sparse_ms)", + "$(r.M),$(r.D),$(r.D^2),$(r.gpu_ms),$(r.cpu_dense_ms),$(r.cpu_sparse_ms),$(r.gpu_cusparse_ms)", ) end end println("Results saved to $csv_file") + + # Emit customSmallerIsBetter JSON for github-action-benchmark + json_file = joinpath(results_dir, "bench.json") + open(json_file, "w") do io + print(io, "[") + first = true + for r in results + for (label, value) in ( + ("cuDensityMat GPU L[ρ] M=$(r.M) D=$(r.D)", r.gpu_ms), + ("QT.jl GPU cuSPARSE L[ρ] M=$(r.M) D=$(r.D)", r.gpu_cusparse_ms), + ("CPU dense SpMV L[ρ] M=$(r.M) D=$(r.D)", r.cpu_dense_ms), + ("QT.jl CPU sparse L[ρ] M=$(r.M) D=$(r.D)", r.cpu_sparse_ms), + ) + isnan(value) && continue + first || print(io, ",") + first = false + print( + io, + "{\"name\":\"", + label, + "\",\"unit\":\"ms\",\"value\":", + value, + "}", + ) + end + end + print(io, "]") + end + println("JSON results saved to $json_file") end main() diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md index 94c163c..337043c 100644 --- a/docs/src/benchmarks.md +++ b/docs/src/benchmarks.md @@ -1,5 +1,7 @@ # Benchmarks +**[Live time-series dashboard →](https://harmoniqs.github.io/CuQuantum.jl/bench/)** — per-commit GPU benchmark history, 
populated on every push to `main`. + All benchmarks use the same Lindblad master equation for ``M`` coupled cavities with Fock truncation ``d=3``: ```math