Conversation

@albertomercurio
Contributor

The previous implementation of kron between a CUSPARSE matrix and a Diagonal was wrong: it assumed the diagonal to be the identity, which is not always the case. I have extended it to any general Diagonal matrix wrapping a CuVector.

To support kron with identity-like matrices, I have also added an extension for FillArrays.jl, where the diagonal is filled with a single value.
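
To illustrate, here is a minimal usage sketch of the two cases this PR targets. The CPU references on the right-hand side follow the patterns used in the new tests; exact imports may vary with the CUDA.jl version, so treat this as a sketch rather than verbatim PR code.

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays, LinearAlgebra, FillArrays, Adapt

A  = sprand(Float32, 10, 10, 0.2)
dA = CuSparseMatrixCSR(A)

# General Diagonal wrapping a CuVector (previously mishandled as the identity)
D  = Diagonal(rand(Float32, 5))
dD = adapt(CuArray, D)
@assert collect(kron(dA, dD)) ≈ kron(A, D)

# Identity-like Diagonal built from FillArrays, handled by the new extension
F = Diagonal(Fill(2.5f0, 5))
@assert collect(kron(dA, F)) ≈ kron(A, collect(F))
```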

@github-actions
Contributor

github-actions bot commented Nov 14, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (`git runic master`) to apply these changes.

Click here to view the suggested changes.
diff --git a/ext/FillArraysExt.jl b/ext/FillArraysExt.jl
index 865faebf5..9fbe77f1f 100644
--- a/ext/FillArraysExt.jl
+++ b/ext/FillArraysExt.jl
@@ -28,13 +28,13 @@ function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T
     col = repeat(col, inner = nB)
     data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = nB)
 
-    row .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
-    col .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
+    row .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
+    col .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
 
     # Multiply by the fill value (already promoted type T)
     data .*= fill_value
 
-    CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+    return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
 end
 
 # kron between Diagonal{T, AbstractFill} and CuSparseMatrixCOO
@@ -52,9 +52,9 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
         return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
     end
 
-    row = (0:nA-1) .* mB
+    row = (0:(nA - 1)) .* mB
     row = CUDA.CuVector(repeat(row, inner = Bnnz))
-    col = (0:nA-1) .* nB
+    col = (0:(nA - 1)) .* nB
     col = CUDA.CuVector(repeat(col, inner = Bnnz))
     data = CUDA.fill(T(fill_value), nA * Bnnz)
 
@@ -63,7 +63,7 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
 
     data .*= repeat(B.nzVal, outer = nA)
 
-    CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+    return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
 end
 
 end  # extension module
diff --git a/test/libraries/cusparse/linalg.jl b/test/libraries/cusparse/linalg.jl
index 6c6308003..153fbebaf 100644
--- a/test/libraries/cusparse/linalg.jl
+++ b/test/libraries/cusparse/linalg.jl
@@ -6,7 +6,7 @@ m = 10
     A  = sprand(T, m, m, 0.2)
     B  = sprand(T, m, m, 0.3)
     ZA = spzeros(T, m, m)
-    C  = Diagonal(rand(T, div(m, 2)))
+    C = Diagonal(rand(T, div(m, 2)))
 
     dC = adapt(CuArray, C)
     @testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
@@ -38,45 +38,45 @@ m = 10
         end
         @testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
             @test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
-            @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A)) 
+            @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
             @test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
             @test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
         end
     end
-    
+
     # Test type promotion for kron with Diagonal
     @testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64)]
         A_T1 = sprand(T1, m, m, 0.2)
         dA_T1 = CuSparseMatrixCSR(A_T1)
         dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
         C_T2 = collect(dC_T2)
-        
+
         # Test kron(sparse_T1, diagonal_T2)
         result_gpu = kron(dA_T1, dC_T2)
         result_cpu = kron(A_T1, C_T2)
         @test eltype(result_gpu) == promote_type(T1, T2)
         @test collect(result_gpu) ≈ result_cpu
-        
+
         # Test kron(diagonal_T2, sparse_T1)
         result_gpu = kron(dC_T2, dA_T1)
         result_cpu = kron(C_T2, A_T1)
         @test eltype(result_gpu) == promote_type(T1, T2)
         @test collect(result_gpu) ≈ result_cpu
     end
-    
+
     # Test kron with zero diagonal matrices
     @testset "kron with zero Diagonal" begin
         A = sprand(T, m, m, 0.2)
         dA = CuSparseMatrixCSR(A)
         dC_zeros = Diagonal(CUDA.zeros(T, div(m, 2)))
         C_zeros = collect(dC_zeros)
-        
+
         # Test kron(sparse, zero_diagonal)
         result_gpu = kron(dA, dC_zeros)
         result_cpu = kron(A, C_zeros)
         @test SparseMatrixCSC(result_gpu) ≈ result_cpu
         @test nnz(result_gpu) == 0
-        
+
         # Test kron(zero_diagonal, sparse)
         result_gpu = kron(dC_zeros, dA)
         result_cpu = kron(C_zeros, A)
diff --git a/test/libraries/fillarrays.jl b/test/libraries/fillarrays.jl
index e4d97c0c8..9df35256f 100644
--- a/test/libraries/fillarrays.jl
+++ b/test/libraries/fillarrays.jl
@@ -6,91 +6,91 @@ using FillArrays
     @testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
         m, n = 100, 80
         A = sprand(T, m, n, 0.2)
-        
+
         # Test with Ones
         @testset "Ones - size $diag_size" for diag_size in [5, 10]
             C_ones = Diagonal(Ones{T}(diag_size))
             dA = CuSparseMatrixCSR(A)
-            
+
             # Test kron(sparse, diagonal)
             result_gpu = collect(kron(dA, C_ones))
             result_cpu = kron(A, collect(C_ones))
             @test result_gpu ≈ result_cpu
-            
+
             # Test kron(diagonal, sparse)
             result_gpu = collect(kron(C_ones, dA))
             result_cpu = kron(collect(C_ones), A)
             @test result_gpu ≈ result_cpu
         end
-        
+
         # Test with Fill (constant value)
         @testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
             C_fill = Diagonal(Fill(val, diag_size))
             dA = CuSparseMatrixCSR(A)
-            
+
             # Test kron(sparse, diagonal)
             result_gpu = collect(kron(dA, C_fill))
             result_cpu = kron(A, collect(C_fill))
             @test result_gpu ≈ result_cpu
-            
+
             # Test kron(diagonal, sparse)
             result_gpu = collect(kron(C_fill, dA))
             result_cpu = kron(collect(C_fill), A)
             @test result_gpu ≈ result_cpu
         end
-        
+
         # Test with Zeros
         @testset "Zeros - size $diag_size" for diag_size in [5, 10]
             C_zeros = Diagonal(Zeros{T}(diag_size))
             dA = CuSparseMatrixCSR(A)
-            
+
             # Test kron(sparse, diagonal)
             result_gpu = kron(dA, C_zeros)
             result_cpu = kron(A, collect(C_zeros))
             @test SparseMatrixCSC(result_gpu) ≈ result_cpu
             @test nnz(result_gpu) == 0
-            
+
             # Test kron(diagonal, sparse)
             result_gpu = kron(C_zeros, dA)
             result_cpu = kron(collect(C_zeros), A)
             @test SparseMatrixCSC(result_gpu) ≈ result_cpu
             @test nnz(result_gpu) == 0
         end
-        
+
         # Test with transpose and adjoint wrappers
         @testset "Transpose/Adjoint with Fill" begin
             diag_size = 5
             val = T(3.0)
             C_fill = Diagonal(Fill(val, diag_size))
-            
+
             @testset "opa = $opa" for opa in (identity, transpose, adjoint)
                 dA = CuSparseMatrixCSR(A)
-                
+
                 # Test kron(opa(sparse), diagonal)
                 result_gpu = collect(kron(opa(dA), C_fill))
                 result_cpu = kron(opa(A), collect(C_fill))
                 @test result_gpu ≈ result_cpu
-                
+
                 # Test kron(diagonal, opa(sparse))
                 result_gpu = collect(kron(C_fill, opa(dA)))
                 result_cpu = kron(collect(C_fill), opa(A))
                 @test result_gpu ≈ result_cpu
             end
         end
-        
+
         # Test type promotion
         @testset "Type promotion" begin
             @testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64), (ComplexF32, Float32)]
                 A_T1 = sprand(T1, m, n, 0.2)
                 dA_T1 = CuSparseMatrixCSR(A_T1)
                 C_fill_T2 = Diagonal(Fill(T2(2.5), 5))
-                
+
                 # Test kron(sparse_T1, diagonal_T2)
                 result_gpu = kron(dA_T1, C_fill_T2)
                 result_cpu = kron(A_T1, collect(C_fill_T2))
                 @test eltype(result_gpu) == promote_type(T1, T2)
                 @test collect(result_gpu) ≈ result_cpu
-                
+
                 # Test kron(diagonal_T2, sparse_T1)
                 result_gpu = kron(C_fill_T2, dA_T1)
                 result_cpu = kron(collect(C_fill_T2), A_T1)

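For context, here is a hedged CPU-only sketch (not the GPU code touched by the diff above) of the COO index arithmetic that the kron(A, Diagonal(Fill(val, nB))) path relies on: each nonzero A[i, j] contributes nB entries on the diagonal of the (i, j) block. The helper name is purely illustrative.

```julia
using SparseArrays, LinearAlgebra

# CPU analogue of the COO construction; illustrative only, not part of the PR.
function kron_coo_fill_diag(A::SparseMatrixCSC, val, nB::Integer)
    mA, nA = size(A)
    Ai, Aj, Av = findnz(A)                      # COO triplets of A
    Annz = length(Av)
    # nonzero A[i, j] -> entries at (nB*(i-1) + k, nB*(j-1) + k) for k = 1:nB
    row  = repeat((Ai .- 1) .* nB, inner = nB) .+ repeat(1:nB, outer = Annz)
    col  = repeat((Aj .- 1) .* nB, inner = nB) .+ repeat(1:nB, outer = Annz)
    data = repeat(Av, inner = nB) .* val
    return sparse(row, col, data, mA * nB, nA * nB)
end

A = sprand(4, 3, 0.5)
@assert kron_coo_fill_diag(A, 2.5, 2) ≈ kron(A, Diagonal(fill(2.5, 2)))
```
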
@maleadt
Member

maleadt commented Nov 17, 2025

Replaces #2804?

@albertomercurio
Contributor Author

Oh, I wasn't aware of that PR.

I would say that both fix the bug. This PR additionally implements kron with a true identity via FillArrays.jl.

@albertomercurio
Contributor Author

Hello, any thoughts on this?


@github-actions github-actions bot left a comment


CUDA.jl Benchmarks

Benchmark suite Current: 2ee822e Previous: 5d9474a Ratio
latency/precompile 54976267490 ns 55510377029.5 ns 0.99
latency/ttfp 7892269137 ns 7790703567 ns 1.01
latency/import 4163528684.5 ns 4122189304 ns 1.01
integration/volumerhs 9623297.5 ns 9624973 ns 1.00
integration/byval/slices=1 147104 ns 147064 ns 1.00
integration/byval/slices=3 425930 ns 425893 ns 1.00
integration/byval/reference 145081 ns 145082 ns 1.00
integration/byval/slices=2 286421.5 ns 286384 ns 1.00
integration/cudadevrt 103563 ns 103602 ns 1.00
kernel/indexing 14245 ns 14225 ns 1.00
kernel/indexing_checked 14857 ns 14969 ns 0.99
kernel/occupancy 730.8037037037036 ns 732.5227272727273 ns 1.00
kernel/launch 2282.3333333333335 ns 2249.4444444444443 ns 1.01
kernel/rand 14871 ns 18642 ns 0.80
array/reverse/1d 19817 ns 19990 ns 0.99
array/reverse/2dL_inplace 66891 ns 66917 ns 1.00
array/reverse/1dL 70066 ns 70158 ns 1.00
array/reverse/2d 21887 ns 21954 ns 1.00
array/reverse/1d_inplace 9936 ns 9677 ns 1.03
array/reverse/2d_inplace 13478 ns 11077 ns 1.22
array/reverse/2dL 73893 ns 74051.5 ns 1.00
array/reverse/1dL_inplace 67245 ns 66880 ns 1.01
array/copy 20582 ns 20660 ns 1.00
array/iteration/findall/int 157809.5 ns 158373 ns 1.00
array/iteration/findall/bool 139937 ns 140139 ns 1.00
array/iteration/findfirst/int 161392 ns 161271 ns 1.00
array/iteration/findfirst/bool 162120.5 ns 162049 ns 1.00
array/iteration/scalar 71973 ns 72812.5 ns 0.99
array/iteration/logical 216531 ns 216894.5 ns 1.00
array/iteration/findmin/1d 51408 ns 50981 ns 1.01
array/iteration/findmin/2d 96465 ns 96704 ns 1.00
array/reductions/reduce/Int64/1d 43712 ns 43491 ns 1.01
array/reductions/reduce/Int64/dims=1 45340.5 ns 52642.5 ns 0.86
array/reductions/reduce/Int64/dims=2 61470 ns 61484 ns 1.00
array/reductions/reduce/Int64/dims=1L 88983 ns 88879 ns 1.00
array/reductions/reduce/Int64/dims=2L 88210 ns 87977 ns 1.00
array/reductions/reduce/Float32/1d 37257.5 ns 37248.5 ns 1.00
array/reductions/reduce/Float32/dims=1 45350.5 ns 43278 ns 1.05
array/reductions/reduce/Float32/dims=2 59694 ns 60066 ns 0.99
array/reductions/reduce/Float32/dims=1L 52331 ns 52282 ns 1.00
array/reductions/reduce/Float32/dims=2L 71952 ns 72365.5 ns 0.99
array/reductions/mapreduce/Int64/1d 43926 ns 43561 ns 1.01
array/reductions/mapreduce/Int64/dims=1 45095.5 ns 44306 ns 1.02
array/reductions/mapreduce/Int64/dims=2 61364 ns 61482 ns 1.00
array/reductions/mapreduce/Int64/dims=1L 88947 ns 89001 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 87932 ns 88320 ns 1.00
array/reductions/mapreduce/Float32/1d 37014 ns 38092.5 ns 0.97
array/reductions/mapreduce/Float32/dims=1 51969.5 ns 41962 ns 1.24
array/reductions/mapreduce/Float32/dims=2 59883 ns 60039 ns 1.00
array/reductions/mapreduce/Float32/dims=1L 52655.5 ns 52636 ns 1.00
array/reductions/mapreduce/Float32/dims=2L 72106 ns 72310 ns 1.00
array/broadcast 20168 ns 20127 ns 1.00
array/copyto!/gpu_to_gpu 11228 ns 12738 ns 0.88
array/copyto!/cpu_to_gpu 217858.5 ns 217857 ns 1.00
array/copyto!/gpu_to_cpu 283913 ns 287088 ns 0.99
array/accumulate/Int64/1d 124690 ns 124778 ns 1.00
array/accumulate/Int64/dims=1 83792 ns 83708 ns 1.00
array/accumulate/Int64/dims=2 158389.5 ns 158367 ns 1.00
array/accumulate/Int64/dims=1L 1720126 ns 1710164 ns 1.01
array/accumulate/Int64/dims=2L 968168 ns 967254 ns 1.00
array/accumulate/Float32/1d 109079 ns 109314 ns 1.00
array/accumulate/Float32/dims=1 80980 ns 80184 ns 1.01
array/accumulate/Float32/dims=2 148191 ns 147922 ns 1.00
array/accumulate/Float32/dims=1L 1619247 ns 1618786 ns 1.00
array/accumulate/Float32/dims=2L 698719 ns 698724 ns 1.00
array/construct 1291.75 ns 1295.5 ns 1.00
array/random/randn/Float32 45186.5 ns 47861 ns 0.94
array/random/randn!/Float32 24986 ns 24875 ns 1.00
array/random/rand!/Int64 27569 ns 27408 ns 1.01
array/random/rand!/Float32 8848.833333333332 ns 8909.666666666666 ns 0.99
array/random/rand/Int64 30658 ns 30055 ns 1.02
array/random/rand/Float32 13117 ns 13184 ns 0.99
array/permutedims/4d 55312 ns 55109 ns 1.00
array/permutedims/2d 54319 ns 53832 ns 1.01
array/permutedims/3d 54902 ns 54841 ns 1.00
array/sorting/1d 2757494 ns 2757534 ns 1.00
array/sorting/by 3344816 ns 3344541 ns 1.00
array/sorting/2d 1081979 ns 1081521 ns 1.00
cuda/synchronization/stream/auto 1030 ns 1036.5 ns 0.99
cuda/synchronization/stream/nonblocking 7308.700000000001 ns 7410.8 ns 0.99
cuda/synchronization/stream/blocking 833.3855421686746 ns 820.6336633663366 ns 1.02
cuda/synchronization/context/auto 1162.7 ns 1154.3 ns 1.01
cuda/synchronization/context/nonblocking 6919.4 ns 7124.4 ns 0.97
cuda/synchronization/context/blocking 905.9565217391304 ns 887.4107142857143 ns 1.02

This comment was automatically generated by workflow using github-action-benchmark.

@maleadt
Member

maleadt commented Dec 31, 2025

These changes seem to cause test failures.

@albertomercurio
Contributor Author

OK, the errors should now be fixed. There are some failures on Julia v1.12 and nightly related to `norm`, which should be unrelated to this PR.

@codecov

codecov bot commented Jan 4, 2026

Codecov Report

✅ All modified and coverable lines are covered by tests.
✅ Project coverage is 89.47%. Comparing base (0c00b83) to head (96f94a2).

Additional details and impacted files
@@            Coverage Diff             @@
##           master    #2973      +/-   ##
==========================================
+ Coverage   89.43%   89.47%   +0.03%     
==========================================
  Files         148      148              
  Lines       12991    13001      +10     
==========================================
+ Hits        11619    11632      +13     
+ Misses       1372     1369       -3     

☔ View full report in Codecov by Sentry.

@albertomercurio
Copy link
Contributor Author

OK, after updating the branch, all the tests pass.
