[CUSPARSE] Fix wrong method for kron with Diagonal and add support to kron with FillArrays
#2973
base: master
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/ext/FillArraysExt.jl b/ext/FillArraysExt.jl
index 865faebf5..9fbe77f1f 100644
--- a/ext/FillArraysExt.jl
+++ b/ext/FillArraysExt.jl
@@ -28,13 +28,13 @@ function LinearAlgebra.kron(A::CUSPARSE.CuSparseMatrixCOO{T1, Ti}, B::Diagonal{T
col = repeat(col, inner = nB)
data = repeat(convert(CUDA.CuVector{T}, A.nzVal), inner = nB)
- row .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
- col .+= CUDA.CuVector(repeat(0:nB-1, outer = Annz)) .+ 1
+ row .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
+ col .+= CUDA.CuVector(repeat(0:(nB - 1), outer = Annz)) .+ 1
# Multiply by the fill value (already promoted type T)
data .*= fill_value
- CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+ return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end
# kron between Diagonal{T, AbstractFill} and CuSparseMatrixCOO
@@ -52,9 +52,9 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
return CUSPARSE.CuSparseMatrixCOO(CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{Ti}(undef, 0), CUDA.CuVector{T}(undef, 0), out_shape)
end
- row = (0:nA-1) .* mB
+ row = (0:(nA - 1)) .* mB
row = CUDA.CuVector(repeat(row, inner = Bnnz))
- col = (0:nA-1) .* nB
+ col = (0:(nA - 1)) .* nB
col = CUDA.CuVector(repeat(col, inner = Bnnz))
data = CUDA.fill(T(fill_value), nA * Bnnz)
@@ -63,7 +63,7 @@ function LinearAlgebra.kron(A::Diagonal{T1, <:FillArrays.AbstractFill{T1}}, B::C
data .*= repeat(B.nzVal, outer = nA)
- CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
+ return CUSPARSE.sparse(row, col, data, out_shape..., fmt = :coo)
end
end # extension module
diff --git a/test/libraries/cusparse/linalg.jl b/test/libraries/cusparse/linalg.jl
index 6c6308003..153fbebaf 100644
--- a/test/libraries/cusparse/linalg.jl
+++ b/test/libraries/cusparse/linalg.jl
@@ -6,7 +6,7 @@ m = 10
A = sprand(T, m, m, 0.2)
B = sprand(T, m, m, 0.3)
ZA = spzeros(T, m, m)
- C = Diagonal(rand(T, div(m, 2)))
+ C = Diagonal(rand(T, div(m, 2)))
dC = adapt(CuArray, C)
@testset "type = $typ" for typ in [CuSparseMatrixCSR, CuSparseMatrixCSC]
@@ -38,45 +38,45 @@ m = 10
end
@testset "kronecker product with I opa = $opa" for opa in (identity, transpose, adjoint)
@test collect(kron(opa(dA), dC)) ≈ kron(opa(A), C)
- @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
+ @test collect(kron(dC, opa(dA))) ≈ kron(C, opa(A))
@test collect(kron(opa(dZA), dC)) ≈ kron(opa(ZA), C)
@test collect(kron(dC, opa(dZA))) ≈ kron(C, opa(ZA))
end
end
-
+
# Test type promotion for kron with Diagonal
@testset "Type promotion - T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64)]
A_T1 = sprand(T1, m, m, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
dC_T2 = Diagonal(CUDA.rand(T2, div(m, 2)))
C_T2 = collect(dC_T2)
-
+
# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, dC_T2)
result_cpu = kron(A_T1, C_T2)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
-
+
# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(dC_T2, dA_T1)
result_cpu = kron(C_T2, A_T1)
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
end
-
+
# Test kron with zero diagonal matrices
@testset "kron with zero Diagonal" begin
A = sprand(T, m, m, 0.2)
dA = CuSparseMatrixCSR(A)
dC_zeros = Diagonal(CUDA.zeros(T, div(m, 2)))
C_zeros = collect(dC_zeros)
-
+
# Test kron(sparse, zero_diagonal)
result_gpu = kron(dA, dC_zeros)
result_cpu = kron(A, C_zeros)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
-
+
# Test kron(zero_diagonal, sparse)
result_gpu = kron(dC_zeros, dA)
result_cpu = kron(C_zeros, A)
diff --git a/test/libraries/fillarrays.jl b/test/libraries/fillarrays.jl
index e4d97c0c8..9df35256f 100644
--- a/test/libraries/fillarrays.jl
+++ b/test/libraries/fillarrays.jl
@@ -6,91 +6,91 @@ using FillArrays
@testset "T = $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
m, n = 100, 80
A = sprand(T, m, n, 0.2)
-
+
# Test with Ones
@testset "Ones - size $diag_size" for diag_size in [5, 10]
C_ones = Diagonal(Ones{T}(diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = collect(kron(dA, C_ones))
result_cpu = kron(A, collect(C_ones))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, sparse)
result_gpu = collect(kron(C_ones, dA))
result_cpu = kron(collect(C_ones), A)
@test result_gpu ≈ result_cpu
end
-
+
# Test with Fill (constant value)
@testset "Fill - value $val, size $diag_size" for val in [T(2.5), T(-1.0)], diag_size in [5, 10]
C_fill = Diagonal(Fill(val, diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = collect(kron(dA, C_fill))
result_cpu = kron(A, collect(C_fill))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, sparse)
result_gpu = collect(kron(C_fill, dA))
result_cpu = kron(collect(C_fill), A)
@test result_gpu ≈ result_cpu
end
-
+
# Test with Zeros
@testset "Zeros - size $diag_size" for diag_size in [5, 10]
C_zeros = Diagonal(Zeros{T}(diag_size))
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(sparse, diagonal)
result_gpu = kron(dA, C_zeros)
result_cpu = kron(A, collect(C_zeros))
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
-
+
# Test kron(diagonal, sparse)
result_gpu = kron(C_zeros, dA)
result_cpu = kron(collect(C_zeros), A)
@test SparseMatrixCSC(result_gpu) ≈ result_cpu
@test nnz(result_gpu) == 0
end
-
+
# Test with transpose and adjoint wrappers
@testset "Transpose/Adjoint with Fill" begin
diag_size = 5
val = T(3.0)
C_fill = Diagonal(Fill(val, diag_size))
-
+
@testset "opa = $opa" for opa in (identity, transpose, adjoint)
dA = CuSparseMatrixCSR(A)
-
+
# Test kron(opa(sparse), diagonal)
result_gpu = collect(kron(opa(dA), C_fill))
result_cpu = kron(opa(A), collect(C_fill))
@test result_gpu ≈ result_cpu
-
+
# Test kron(diagonal, opa(sparse))
result_gpu = collect(kron(C_fill, opa(dA)))
result_cpu = kron(collect(C_fill), opa(A))
@test result_gpu ≈ result_cpu
end
end
-
+
# Test type promotion
@testset "Type promotion" begin
@testset "T1 = $T1, T2 = $T2" for (T1, T2) in [(Float64, Float32), (ComplexF64, Float64), (ComplexF32, Float32)]
A_T1 = sprand(T1, m, n, 0.2)
dA_T1 = CuSparseMatrixCSR(A_T1)
C_fill_T2 = Diagonal(Fill(T2(2.5), 5))
-
+
# Test kron(sparse_T1, diagonal_T2)
result_gpu = kron(dA_T1, C_fill_T2)
result_cpu = kron(A_T1, collect(C_fill_T2))
@test eltype(result_gpu) == promote_type(T1, T2)
@test collect(result_gpu) ≈ result_cpu
-
+
# Test kron(diagonal_T2, sparse_T1)
result_gpu = kron(C_fill_T2, dA_T1)
result_cpu = kron(collect(C_fill_T2), A_T1)
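For reference, the index arithmetic of the `Diagonal{<:AbstractFill} ⊗ CuSparseMatrixCOO` branch shown in the diff above can be sketched on the CPU as follows. This is a minimal illustration only; `kron_fill_diag_coo` is a hypothetical helper written for this explanation, not part of the PR:

```julia
using SparseArrays, LinearAlgebra, FillArrays

# CPU sketch of the COO construction for kron(Diagonal(Fill(v, nA)), B):
# diagonal block a (1-based) holds v * B at row offset (a-1)*mB and column offset (a-1)*nB.
function kron_fill_diag_coo(v, nA, B::SparseMatrixCSC)
    mB, nB = size(B)
    Bi, Bj, Bv = findnz(B)                                   # COO triplets of B
    row = repeat((0:(nA - 1)) .* mB, inner = length(Bv)) .+ repeat(Bi, outer = nA)
    col = repeat((0:(nA - 1)) .* nB, inner = length(Bv)) .+ repeat(Bj, outer = nA)
    data = v .* repeat(Bv, outer = nA)
    return sparse(row, col, data, nA * mB, nA * nB)
end

B = sprand(Float64, 4, 3, 0.5)
kron_fill_diag_coo(2.5, 3, B) ≈ kron(Diagonal(Fill(2.5, 3)), B)   # expected: true
```

The GPU version in the extension builds the same `row`, `col`, and `data` vectors as `CuVector`s and passes them to `CUSPARSE.sparse(...; fmt = :coo)`.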
Replaces #2804?
Oh, I wasn't aware of that PR. I would say that both fix the bug. This PR also implements the FillArrays extension.
Hello, any thoughts on this? |
CUDA.jl Benchmarks
| Benchmark suite | Current: 2ee822e | Previous: 5d9474a | Ratio |
|---|---|---|---|
| latency/precompile | 54976267490 ns | 55510377029.5 ns | 0.99 |
| latency/ttfp | 7892269137 ns | 7790703567 ns | 1.01 |
| latency/import | 4163528684.5 ns | 4122189304 ns | 1.01 |
| integration/volumerhs | 9623297.5 ns | 9624973 ns | 1.00 |
| integration/byval/slices=1 | 147104 ns | 147064 ns | 1.00 |
| integration/byval/slices=3 | 425930 ns | 425893 ns | 1.00 |
| integration/byval/reference | 145081 ns | 145082 ns | 1.00 |
| integration/byval/slices=2 | 286421.5 ns | 286384 ns | 1.00 |
| integration/cudadevrt | 103563 ns | 103602 ns | 1.00 |
| kernel/indexing | 14245 ns | 14225 ns | 1.00 |
| kernel/indexing_checked | 14857 ns | 14969 ns | 0.99 |
| kernel/occupancy | 730.8037037037036 ns | 732.5227272727273 ns | 1.00 |
| kernel/launch | 2282.3333333333335 ns | 2249.4444444444443 ns | 1.01 |
| kernel/rand | 14871 ns | 18642 ns | 0.80 |
| array/reverse/1d | 19817 ns | 19990 ns | 0.99 |
| array/reverse/2dL_inplace | 66891 ns | 66917 ns | 1.00 |
| array/reverse/1dL | 70066 ns | 70158 ns | 1.00 |
| array/reverse/2d | 21887 ns | 21954 ns | 1.00 |
| array/reverse/1d_inplace | 9936 ns | 9677 ns | 1.03 |
| array/reverse/2d_inplace | 13478 ns | 11077 ns | 1.22 |
| array/reverse/2dL | 73893 ns | 74051.5 ns | 1.00 |
| array/reverse/1dL_inplace | 67245 ns | 66880 ns | 1.01 |
| array/copy | 20582 ns | 20660 ns | 1.00 |
| array/iteration/findall/int | 157809.5 ns | 158373 ns | 1.00 |
| array/iteration/findall/bool | 139937 ns | 140139 ns | 1.00 |
| array/iteration/findfirst/int | 161392 ns | 161271 ns | 1.00 |
| array/iteration/findfirst/bool | 162120.5 ns | 162049 ns | 1.00 |
| array/iteration/scalar | 71973 ns | 72812.5 ns | 0.99 |
| array/iteration/logical | 216531 ns | 216894.5 ns | 1.00 |
| array/iteration/findmin/1d | 51408 ns | 50981 ns | 1.01 |
| array/iteration/findmin/2d | 96465 ns | 96704 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 43712 ns | 43491 ns | 1.01 |
| array/reductions/reduce/Int64/dims=1 | 45340.5 ns | 52642.5 ns | 0.86 |
| array/reductions/reduce/Int64/dims=2 | 61470 ns | 61484 ns | 1.00 |
| array/reductions/reduce/Int64/dims=1L | 88983 ns | 88879 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 88210 ns | 87977 ns | 1.00 |
| array/reductions/reduce/Float32/1d | 37257.5 ns | 37248.5 ns | 1.00 |
| array/reductions/reduce/Float32/dims=1 | 45350.5 ns | 43278 ns | 1.05 |
| array/reductions/reduce/Float32/dims=2 | 59694 ns | 60066 ns | 0.99 |
| array/reductions/reduce/Float32/dims=1L | 52331 ns | 52282 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2L | 71952 ns | 72365.5 ns | 0.99 |
| array/reductions/mapreduce/Int64/1d | 43926 ns | 43561 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1 | 45095.5 ns | 44306 ns | 1.02 |
| array/reductions/mapreduce/Int64/dims=2 | 61364 ns | 61482 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=1L | 88947 ns | 89001 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2L | 87932 ns | 88320 ns | 1.00 |
| array/reductions/mapreduce/Float32/1d | 37014 ns | 38092.5 ns | 0.97 |
| array/reductions/mapreduce/Float32/dims=1 | 51969.5 ns | 41962 ns | 1.24 |
| array/reductions/mapreduce/Float32/dims=2 | 59883 ns | 60039 ns | 1.00 |
| array/reductions/mapreduce/Float32/dims=1L | 52655.5 ns | 52636 ns | 1.00 |
| array/reductions/mapreduce/Float32/dims=2L | 72106 ns | 72310 ns | 1.00 |
| array/broadcast | 20168 ns | 20127 ns | 1.00 |
| array/copyto!/gpu_to_gpu | 11228 ns | 12738 ns | 0.88 |
| array/copyto!/cpu_to_gpu | 217858.5 ns | 217857 ns | 1.00 |
| array/copyto!/gpu_to_cpu | 283913 ns | 287088 ns | 0.99 |
| array/accumulate/Int64/1d | 124690 ns | 124778 ns | 1.00 |
| array/accumulate/Int64/dims=1 | 83792 ns | 83708 ns | 1.00 |
| array/accumulate/Int64/dims=2 | 158389.5 ns | 158367 ns | 1.00 |
| array/accumulate/Int64/dims=1L | 1720126 ns | 1710164 ns | 1.01 |
| array/accumulate/Int64/dims=2L | 968168 ns | 967254 ns | 1.00 |
| array/accumulate/Float32/1d | 109079 ns | 109314 ns | 1.00 |
| array/accumulate/Float32/dims=1 | 80980 ns | 80184 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 148191 ns | 147922 ns | 1.00 |
| array/accumulate/Float32/dims=1L | 1619247 ns | 1618786 ns | 1.00 |
| array/accumulate/Float32/dims=2L | 698719 ns | 698724 ns | 1.00 |
| array/construct | 1291.75 ns | 1295.5 ns | 1.00 |
| array/random/randn/Float32 | 45186.5 ns | 47861 ns | 0.94 |
| array/random/randn!/Float32 | 24986 ns | 24875 ns | 1.00 |
| array/random/rand!/Int64 | 27569 ns | 27408 ns | 1.01 |
| array/random/rand!/Float32 | 8848.833333333332 ns | 8909.666666666666 ns | 0.99 |
| array/random/rand/Int64 | 30658 ns | 30055 ns | 1.02 |
| array/random/rand/Float32 | 13117 ns | 13184 ns | 0.99 |
| array/permutedims/4d | 55312 ns | 55109 ns | 1.00 |
| array/permutedims/2d | 54319 ns | 53832 ns | 1.01 |
| array/permutedims/3d | 54902 ns | 54841 ns | 1.00 |
| array/sorting/1d | 2757494 ns | 2757534 ns | 1.00 |
| array/sorting/by | 3344816 ns | 3344541 ns | 1.00 |
| array/sorting/2d | 1081979 ns | 1081521 ns | 1.00 |
| cuda/synchronization/stream/auto | 1030 ns | 1036.5 ns | 0.99 |
| cuda/synchronization/stream/nonblocking | 7308.700000000001 ns | 7410.8 ns | 0.99 |
| cuda/synchronization/stream/blocking | 833.3855421686746 ns | 820.6336633663366 ns | 1.02 |
| cuda/synchronization/context/auto | 1162.7 ns | 1154.3 ns | 1.01 |
| cuda/synchronization/context/nonblocking | 6919.4 ns | 7124.4 ns | 0.97 |
| cuda/synchronization/context/blocking | 905.9565217391304 ns | 887.4107142857143 ns | 1.02 |
This comment was automatically generated by workflow using github-action-benchmark.
These changes seem to cause test failures.
Ok I should have fixed the errors. There are some errors on Julia v1.12 and nightly related to |
Codecov Report
✅ All modified and coverable lines are covered by tests.
Additional details and impacted files:
@@ Coverage Diff @@
## master #2973 +/- ##
==========================================
+ Coverage 89.43% 89.47% +0.03%
==========================================
Files 148 148
Lines 12991 13001 +10
==========================================
+ Hits 11619 11632 +13
+ Misses 1372 1369 -3

☔ View full report in Codecov by Sentry.
OK, after updating the branch, all the tests pass.
The previous implementation of `kron` between a CUSPARSE matrix and a `Diagonal` was wrong: it assumed the diagonal to be the identity, which is not always the case. I have extended it to any general `Diagonal` matrix embedding a `CuVector`.

In order to support `kron` with identity-like matrices, I have added an extension for FillArrays.jl, where the diagonal is filled with a single value.
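As a usage sketch of the fixed behaviour (assuming a CUDA-capable device, a CUDA.jl build containing this PR, and FillArrays.jl loaded; the variable names are illustrative only):

```julia
using Adapt, CUDA, SparseArrays, LinearAlgebra, FillArrays
using CUDA.CUSPARSE

A  = sprand(Float32, 10, 10, 0.2)    # CPU sparse matrix
dA = CuSparseMatrixCSR(A)            # its GPU counterpart

# General Diagonal: the diagonal entries are no longer assumed to be ones.
D  = Diagonal(rand(Float32, 5))
dD = adapt(CuArray, D)               # Diagonal wrapping a CuVector
collect(kron(dA, dD)) ≈ kron(A, D)   # expected to hold with this PR

# Identity-like Diagonal through the FillArrays extension (single fill value).
F = Diagonal(Fill(2.5f0, 5))
collect(kron(dA, F)) ≈ kron(A, collect(F))
```

The reversed argument orders, `kron(dD, dA)` and `kron(F, dA)`, are exercised by the tests shown in the diff above as well.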