diff --git a/.github/workflows/gpu-benchmark.yml b/.github/workflows/gpu-benchmark.yml
index b940756..ae903d2 100644
--- a/.github/workflows/gpu-benchmark.yml
+++ b/.github/workflows/gpu-benchmark.yml
@@ -59,6 +59,9 @@ jobs:
     name: Run Benchmarks
     needs: start-gpu-runner
     runs-on: ${{ needs.start-gpu-runner.outputs.label }}
+    permissions:
+      contents: write
+      pull-requests: write
     env:
       JULIA_CUDA_MEMORY_POOL: none
     timeout-minutes: 120
@@ -79,6 +82,24 @@ jobs:
         with:
           name: benchmark-results-${{ inputs.instance-type }}-${{ github.run_number }}
           path: benchmark/results/
+      # Publish to /bench on gh-pages (time-series chart + regression alerts).
+      # save-data-file and auto-push are gated to push events on main, so PR /
+      # branch runs never alter the published series; they are only compared to it.
+      - name: Publish benchmark to gh-pages
+        if: success() && hashFiles('benchmark/results/bench.json') != ''
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: CuQuantum.jl GPU benchmarks (${{ inputs.instance-type }})
+          tool: customSmallerIsBetter
+          output-file-path: benchmark/results/bench.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          gh-pages-branch: gh-pages
+          benchmark-data-dir-path: bench
+          alert-threshold: '120%'
+          comment-on-alert: true
+          fail-on-alert: false
+          save-data-file: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+          auto-push: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
 
   stop-gpu-runner:
     name: Stop GPU Runner
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 0be70c6..a07827b 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -2,6 +2,7 @@
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CuQuantum = "fc340f4a-ac26-4dfb-b247-6d4fbb695c9a"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+QuantumToolbox = "6c2fb7c5-b903-41d2-bc5e-5a7c320b9fab"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl
index ee77da4..ced7827 100644
--- a/benchmark/run_benchmarks.jl
+++ b/benchmark/run_benchmarks.jl
@@ -1,24 +1,29 @@
 # =============================================================================
-# Benchmark: cuDensityMat (GPU) vs QuantumToolbox.jl (CPU)
+# Benchmark: Liouvillian action L[ρ] — cuDensityMat vs QuantumToolbox.jl
 # =============================================================================
 #
-# Compares the time to evaluate a single Liouvillian action L[ρ] for the
-# dual-rail cavity system at varying numbers of cavities M.
+# Times a single L[ρ] evaluation for the dual-rail cavity system across:
+#   - cuDensityMat (GPU, direct library binding)
+#   - QT.jl GPU cuSPARSE (sparse Liouvillian on-device via QuantumObject)
+#   - QT.jl CPU sparse
+#   - CPU dense (small sizes only)
 #
 # System: M coupled cavities, Fock truncation d=3
-#   H(t) = Σ_m χ n_m(n_m-1) + Σ_m δ_m(t) n_m + Σ_{n≠m} κ(t) a_n†a_m
+#   H = Σ_m χ n_m(n_m-1) + Σ_{n≠m} κ a_n†a_m
 #   L[ρ] = -i[H,ρ] + γ Σ_m (a_m ρ a_m† - ½{n_m, ρ})
 #
 # Usage:
 #   julia --project=. benchmark/run_benchmarks.jl
 #
-# Output: benchmark_results.csv
+# Outputs (written to benchmark/results/):
+#   benchmark_results.csv — per-backend median time (ms) for each M
+#   bench.json            — customSmallerIsBetter format for github-action-benchmark
 
 using CUDA
 using CuQuantum
 using CuQuantum.CuDensityMat
 using LinearAlgebra
-using SparseArrays
+using QuantumToolbox
 using Statistics
 
 # =============================================================================
@@ -341,92 +346,93 @@ function benchmark_cpu_dense(M::Int, d::Int; n_warmup = 3, n_trials = 20)
 end
 
 # =============================================================================
-# 3. CPU sparse baseline: sparse superoperator
+# 3. QuantumToolbox.jl path: sparse Liouvillian via QT.jl, CPU + GPU cuSPARSE
 # =============================================================================
+#
+# Routed through QuantumToolbox.jl's `liouvillian` + `QuantumObject` so the
+# tracked series includes QT.jl dispatch/allocation overhead — regressions in
+# the user-facing QT.jl path show up here.
 
 """
-Build a sparse Liouvillian superoperator directly using Kronecker products
-of sparse matrices. Avoids forming the dense superoperator.
+Build the Lindblad system as QuantumToolbox.jl `QuantumObject`s. Returns the
+Liouvillian super-operator and the vectorized initial density matrix.
 """
-function build_cpu_sparse_system(M::Int, d::Int)
-    T = ComplexF64
-    D = d^M
+function build_qt_system(M::Int, d::Int)
+    a = destroy(d)
+    Id = qeye(d)
 
-    # Single-cavity operators (sparse)
-    a_single = spzeros(T, d, d)
-    for n = 1:(d-1)
-        a_single[n, n+1] = sqrt(n)
-    end
-    n_single = a_single' * a_single
-    kerr_single = n_single * (n_single - sparse(T(1)*I, d, d))
-
-    # Embed into full Hilbert space using sparse Kronecker products
-    function embed_sparse(op, mode, M, d)
-        mats = [i == mode ? op : sparse(T(1)*I, d, d) for i = 1:M]
-        result = mats[1]
-        for i = 2:M
-            result = kron(result, mats[i])
-        end
-        return result
-    end
+    a_ops = [tensor([i == m ? a : Id for i = 1:M]...) for m = 1:M]
+    n_ops = [adjoint(ao) * ao for ao in a_ops]
 
     χ = 2π * 0.2
     κ₀ = 2π * 0.1
     γ = 0.01
 
-    eye_D = sparse(T(1)*I, D, D)
-
-    # Build Hamiltonian (sparse D×D)
-    H = spzeros(T, D, D)
-    a_ops = [embed_sparse(a_single, m, M, d) for m = 1:M]
-    n_ops = [embed_sparse(n_single, m, M, d) for m = 1:M]
-    for m = 1:M
-        H += χ * embed_sparse(kerr_single, m, M, d)
-    end
+    Id_full = tensor(fill(Id, M)...)
+    H = sum(χ * nm * (nm - Id_full) for nm in n_ops)
     for n = 1:M, m = 1:M
         n == m && continue
-        H += κ₀ * (a_ops[n]' * a_ops[m])
+        H += κ₀ * (adjoint(a_ops[n]) * a_ops[m])
     end
 
-    # Build sparse Liouvillian superoperator (D²×D²)
-    # L = -i(I⊗H - Hᵀ⊗I) + γ Σ_m (conj(a_m)⊗a_m - ½ I⊗n_m - ½ n_mᵀ⊗I)
-    L_super = -im * (kron(eye_D, H) - kron(transpose(H), eye_D))
-    for m = 1:M
-        am = a_ops[m]
-        nm = n_ops[m]
-        L_super +=
-            γ * (kron(conj(am), am) - 0.5*kron(eye_D, nm) - 0.5*kron(transpose(nm), eye_D))
-    end
+    c_ops = [sqrt(γ) * ao for ao in a_ops]
+    L = liouvillian(H, c_ops)
 
-    # Initial state
-    rho_vec = zeros(T, D*D)
-    rho_vec[2+D*(2-1)] = 1.0
+    # Initial state: |1,0,...,0⟩⟨1,0,...,0|
+    ψ = tensor(fock(d, 1), [fock(d, 0) for _ = 2:M]...)
+    ρ_vec = mat2vec(ket2dm(ψ))
 
-    return L_super, rho_vec
+    return L, ρ_vec
 end
 
 function benchmark_cpu_sparse(M::Int, d::Int; n_warmup = 3, n_trials = 20)
     D = d^M
-    # Sparse superoperator is D²×D² but with O(M * d² * D²) non-zeros.
-    # For D=729 (M=6): ~531K² matrix, ~50M non-zeros → ~1.2 GB storage. Feasible.
-    # For D=6561 (M=8): ~43M² matrix, ~5B non-zeros → ~120 GB storage. Too large.
+    # Sparse superoperator is D²×D² with O(M·d²·D²) non-zeros.
+    # D=729 (M=6): ~50M nnz ≈ 1.2 GB — feasible. D=6561 (M=8): ~5B nnz — too large.
     if D > 1000
         return NaN, NaN, NaN
     end
 
-    L_sparse, rho_vec = build_cpu_sparse_system(M, d)
-    rho_dot = similar(rho_vec)
+    L, ρ_vec = build_qt_system(M, d)
+
+    # Warmup; `L * ρ_vec` returns a new QObject, so timed runs include its allocation.
+    for _ = 1:n_warmup
+        L * ρ_vec
+    end
+
+    times = Float64[]
+    for _ = 1:n_trials
+        t0 = time_ns()
+        L * ρ_vec
+        t1 = time_ns()
+        push!(times, (t1 - t0) / 1e6)
+    end
+
+    return median(times), minimum(times), maximum(times)
+end
+
+function benchmark_cusparse_gpu(M::Int, d::Int; n_warmup = 5, n_trials = 50)
+    D = d^M
+    if D > 1000
+        return NaN, NaN, NaN
+    end
+
+    L_cpu, ρ_vec_cpu = build_qt_system(M, d)
+    L_gpu = cu(L_cpu)          # QObject wrapping CuSparseMatrixCSC
+    ρ_vec_gpu = cu(ρ_vec_cpu)  # QObject wrapping CuVector
 
     # Warmup
     for _ = 1:n_warmup
-        mul!(rho_dot, L_sparse, rho_vec)
+        L_gpu * ρ_vec_gpu
     end
+    CUDA.synchronize()
 
-    # Timed runs
     times = Float64[]
     for _ = 1:n_trials
+        CUDA.synchronize()
         t0 = time_ns()
-        mul!(rho_dot, L_sparse, rho_vec)
+        L_gpu * ρ_vec_gpu
+        CUDA.synchronize()
         t1 = time_ns()
         push!(times, (t1 - t0) / 1e6)
     end
@@ -526,12 +532,37 @@ function main()
             println("SKIPPED (D=$D too large)")
         end
 
+        # GPU cuSPARSE SpMV
+        print("  GPU cuSPARSE SpMV:  ")
+        gpu_cusparse_med, gpu_cusparse_min, gpu_cusparse_max = try
+            benchmark_cusparse_gpu(M, d)
+        catch e
+            println("FAILED: $e")
+            (NaN, NaN, NaN)
+        end
+        if !isnan(gpu_cusparse_med)
+            println(
+                "$(round(gpu_cusparse_med, digits=3)) ms (min=$(round(gpu_cusparse_min, digits=3)), max=$(round(gpu_cusparse_max, digits=3)))",
+            )
+        elseif D > 1000  # size guard hit; a caught failure has already printed FAILED
+            println("SKIPPED (D=$D too large)")
+        end
+
         # Speedup
         if !isnan(gpu_med) && !isnan(cpu_sparse_med)
-            println("  Speedup (sparse/GPU): $(round(cpu_sparse_med / gpu_med, digits=1))x")
+            println(
+                "  Speedup (sparse/GPU cuDensityMat): $(round(cpu_sparse_med / gpu_med, digits=1))x",
+            )
         end
         if !isnan(gpu_med) && !isnan(cpu_dense_med)
-            println("  Speedup (dense/GPU):  $(round(cpu_dense_med / gpu_med, digits=1))x")
+            println(
+                "  Speedup (dense/GPU cuDensityMat):  $(round(cpu_dense_med / gpu_med, digits=1))x",
+            )
+        end
+        if !isnan(gpu_cusparse_med) && !isnan(gpu_med)
+            println(
+                "  Speedup (cuSPARSE/cuDensityMat):   $(round(gpu_cusparse_med / gpu_med, digits=1))x",
+            )
         end
 
         push!(
@@ -542,6 +573,7 @@ function main()
                 gpu_ms = gpu_med,
                 cpu_dense_ms = cpu_dense_med,
                 cpu_sparse_ms = cpu_sparse_med,
+                gpu_cusparse_ms = gpu_cusparse_med,
             ),
         )
         println()
@@ -552,15 +584,44 @@ function main()
     mkpath(results_dir)
     csv_file = joinpath(results_dir, "benchmark_results.csv")
     open(csv_file, "w") do io
-        println(io, "M,D,rho_elements,gpu_ms,cpu_dense_ms,cpu_sparse_ms")
+        println(io, "M,D,rho_elements,gpu_ms,cpu_dense_ms,cpu_sparse_ms,gpu_cusparse_ms")
         for r in results
             println(
                 io,
-                "$(r.M),$(r.D),$(r.D^2),$(r.gpu_ms),$(r.cpu_dense_ms),$(r.cpu_sparse_ms)",
+                "$(r.M),$(r.D),$(r.D^2),$(r.gpu_ms),$(r.cpu_dense_ms),$(r.cpu_sparse_ms),$(r.gpu_cusparse_ms)",
             )
         end
     end
     println("Results saved to $csv_file")
+
+    # Emit customSmallerIsBetter JSON for github-action-benchmark
+    json_file = joinpath(results_dir, "bench.json")
+    open(json_file, "w") do io
+        print(io, "[")
+        first = true
+        for r in results
+            for (label, value) in (
+                ("cuDensityMat GPU L[ρ] M=$(r.M) D=$(r.D)", r.gpu_ms),
+                ("QT.jl GPU cuSPARSE L[ρ] M=$(r.M) D=$(r.D)", r.gpu_cusparse_ms),
+                ("CPU dense L[ρ] M=$(r.M) D=$(r.D)", r.cpu_dense_ms),
+                ("QT.jl CPU sparse L[ρ] M=$(r.M) D=$(r.D)", r.cpu_sparse_ms),
+            )
+                isfinite(value) || continue  # NaN = skipped/failed; Inf is not valid JSON
+                first || print(io, ",")  # comma-separate every entry after the first
+                first = false
+                print(
+                    io,
+                    "{\"name\":\"",
+                    label,
+                    "\",\"unit\":\"ms\",\"value\":",
+                    value,
+                    "}",
+                )
+            end
+        end
+        print(io, "]")
+    end
+    println("JSON results saved to $json_file")
 end
 
 main()
diff --git a/docs/src/benchmarks.md b/docs/src/benchmarks.md
index 94c163c..337043c 100644
--- a/docs/src/benchmarks.md
+++ b/docs/src/benchmarks.md
@@ -1,5 +1,7 @@
 # Benchmarks
 
+**[Live time-series dashboard →](https://harmoniqs.github.io/CuQuantum.jl/bench/)** — per-commit GPU benchmark history, populated on every push to `main`.
+
 All benchmarks use the same Lindblad master equation for ``M`` coupled cavities with Fock truncation ``d=3``:
 
 ```math