From f8b1f3cd47ff309710b72669d94f2a3b49a615b9 Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Tue, 9 Jan 2024 09:45:47 -0700 Subject: [PATCH] Divide & Conquer + Jacobi (part 2) (#643) * prepare stedc to use other solvers * fix synchronization issues * address review comments * define APIs and add documentation * add syevdj/heevdj * add test infrastructure for syevdj/heevdj * fix test bug * add batched versions * enable jacobi solver in stedc * fix single-block case * fix synchronization problems * fix synchronization problems * clean up * add sygvdj/hegvdj * clean up * update changelog * fix synch problem showing on gfx1101 * addressed review comments --- CHANGELOG.md | 8 + clients/CMakeLists.txt | 4 +- clients/common/testing_syevdj_heevdj.cpp | 32 + clients/common/testing_sygvdj_hegvdj.cpp | 32 + clients/gtest/CMakeLists.txt | 4 +- clients/gtest/syevdj_heevdj_gtest.cpp | 193 +++ clients/gtest/sygvdj_hegvdj_gtest.cpp | 207 +++ clients/include/rocsolver.hpp | 322 +++- clients/include/rocsolver_dispatcher.hpp | 20 +- clients/include/testing_syevdj_heevdj.hpp | 590 +++++++ clients/include/testing_sygvdj_hegvdj.hpp | 726 +++++++++ docs/api/lapacklike.rst | 80 + docs/design/tuning.rst | 20 +- docs/userguide/intro.rst | 4 + .../include/rocsolver/rocsolver-functions.h | 1392 ++++++++++++++--- library/src/CMakeLists.txt | 8 +- library/src/auxiliary/rocauxiliary_stedc.hpp | 1382 +++++++++------- library/src/include/ideal_sizes.hpp | 8 + .../src/lapack/roclapack_syevdj_heevdj.cpp | 173 ++ .../src/lapack/roclapack_syevdj_heevdj.hpp | 265 ++++ .../roclapack_syevdj_heevdj_batched.cpp | 185 +++ ...oclapack_syevdj_heevdj_strided_batched.cpp | 189 +++ library/src/lapack/roclapack_syevj_heevj.hpp | 91 +- .../src/lapack/roclapack_sygvdj_hegvdj.cpp | 194 +++ .../src/lapack/roclapack_sygvdj_hegvdj.hpp | 273 ++++ .../roclapack_sygvdj_hegvdj_batched.cpp | 202 +++ ...oclapack_sygvdj_hegvdj_strided_batched.cpp | 213 +++ 27 files 
changed, 6070 insertions(+), 747 deletions(-) create mode 100644 clients/common/testing_syevdj_heevdj.cpp create mode 100644 clients/common/testing_sygvdj_hegvdj.cpp create mode 100644 clients/gtest/syevdj_heevdj_gtest.cpp create mode 100644 clients/gtest/sygvdj_hegvdj_gtest.cpp create mode 100644 clients/include/testing_syevdj_heevdj.hpp create mode 100644 clients/include/testing_sygvdj_hegvdj.hpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj.cpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj.hpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj_batched.cpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj.cpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj.hpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index e1ffcf54c..060b9cccf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](http ## rocSOLVER 3.25.0 for ROCm 6.1.0 +### Added +- Eigensolver routines for symmetric/hermitian matrices using Divide & Conquer and Jacobi algorithm: + - SYEVDJ (with batched and strided\_batched versions) + - HEEVDJ (with batched and strided\_batched versions) +- Generalized symmetric/hermitian-definite eigensolvers using Divide & Conquer and Jacobi algorithm: + - SYGVDJ (with batched and strided\_batched versions) + - HEGVDJ (with batched and strided\_batched versions) + ### Changed - Relaxed array length requirements for GESVDX with `rocblas_srange_index`. 
diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index 262b4121f..37b7d6439 100755 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -1,5 +1,5 @@ # ########################################################################## -# Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -127,10 +127,12 @@ if(BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) common/testing_sygsx_hegsx.cpp common/testing_syev_heev.cpp common/testing_syevd_heevd.cpp + common/testing_syevdj_heevdj.cpp common/testing_syevj_heevj.cpp common/testing_syevx_heevx.cpp common/testing_sygv_hegv.cpp common/testing_sygvd_hegvd.cpp + common/testing_sygvdj_hegvdj.cpp common/testing_sygvj_hegvj.cpp common/testing_sygvx_hegvx.cpp common/testing_geblttrf_npvt.cpp diff --git a/clients/common/testing_syevdj_heevdj.cpp b/clients/common/testing_syevdj_heevdj.cpp new file mode 100644 index 000000000..233d32348 --- /dev/null +++ b/clients/common/testing_syevdj_heevdj.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ************************************************************************ */ + +#include + +#define TESTING_SYEVDJ_HEEVDJ(...) template void testing_syevdj_heevdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(TESTING_SYEVDJ_HEEVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/common/testing_sygvdj_hegvdj.cpp b/clients/common/testing_sygvdj_hegvdj.cpp new file mode 100644 index 000000000..dcfc2220e --- /dev/null +++ b/clients/common/testing_sygvdj_hegvdj.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ************************************************************************ */ + +#include + +#define TESTING_SYGVDJ_HEGVDJ(...) template void testing_sygvdj_hegvdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(TESTING_SYGVDJ_HEGVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index acd477424..9a32d470e 100755 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -1,5 +1,5 @@ # ########################################################################## -# Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -69,6 +69,8 @@ set(roclapack_test_source sygvj_hegvj_gtest.cpp sygvx_hegvx_gtest.cpp sygvdx_hegvdx_gtest.cpp + syevdj_heevdj_gtest.cpp + sygvdj_hegvdj_gtest.cpp ) set(rocauxiliary_test_source diff --git a/clients/gtest/syevdj_heevdj_gtest.cpp b/clients/gtest/syevdj_heevdj_gtest.cpp new file mode 100644 index 000000000..25edcbc37 --- /dev/null +++ b/clients/gtest/syevdj_heevdj_gtest.cpp @@ -0,0 +1,193 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ************************************************************************ */ + +#include "testing_syevdj_heevdj.hpp" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using namespace std; + +typedef std::tuple, vector> syevdj_heevdj_tuple; + +// each size_range vector is a {n, lda} + +// each op_range vector is a {evect, uplo} + +// case when n == 0, evect == N, and uplo = L will also execute the bad arguments test +// (null handle, null pointers and invalid values) + +const vector> op_range = {{'N', 'L'}, {'N', 'U'}, {'V', 'L'}, {'V', 'U'}}; + +// for checkin_lapack tests +const vector> size_range = { + // quick return + {0, 1}, + // invalid + {-1, 1}, + {10, 5}, + // normal (valid) samples + {1, 1}, + {12, 12}, + {20, 30}, + {36, 36}, + {50, 60}}; + +// for daily_lapack tests +const vector> large_size_range = {{192, 192}, {256, 270}, {300, 300}}; + +Arguments syevdj_heevdj_setup_arguments(syevdj_heevdj_tuple tup) +{ + vector size = std::get<0>(tup); + vector op = std::get<1>(tup); + + Arguments arg; + + arg.set("n", size[0]); + arg.set("lda", size[1]); + + arg.set("evect", op[0]); + arg.set("uplo", op[1]); + + // only testing standard use case/defaults for strides + + arg.timing = 0; + + return arg; +} + +class SYEVDJ_HEEVDJ : public ::TestWithParam +{ +protected: + void TearDown() override + { + EXPECT_EQ(hipGetLastError(), hipSuccess); + } + + template + void 
run_tests() + { + Arguments arg = syevdj_heevdj_setup_arguments(GetParam()); + + if(arg.peek("n") == 0 && arg.peek("evect") == 'N' + && arg.peek("uplo") == 'L') + testing_syevdj_heevdj_bad_arg(); + + arg.batch_count = (BATCHED || STRIDED ? 3 : 1); + testing_syevdj_heevdj(arg); + } +}; + +class SYEVDJ : public SYEVDJ_HEEVDJ +{ +}; + +class HEEVDJ : public SYEVDJ_HEEVDJ +{ +}; + +// non-batch tests + +TEST_P(SYEVDJ, __float) +{ + run_tests(); +} + +TEST_P(SYEVDJ, __double) +{ + run_tests(); +} + +TEST_P(HEEVDJ, __float_complex) +{ + run_tests(); +} + +TEST_P(HEEVDJ, __double_complex) +{ + run_tests(); +} + +// batched tests + +TEST_P(SYEVDJ, batched__float) +{ + run_tests(); +} + +TEST_P(SYEVDJ, batched__double) +{ + run_tests(); +} + +TEST_P(HEEVDJ, batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEEVDJ, batched__double_complex) +{ + run_tests(); +} + +// strided_batched tests + +TEST_P(SYEVDJ, strided_batched__float) +{ + run_tests(); +} + +TEST_P(SYEVDJ, strided_batched__double) +{ + run_tests(); +} + +TEST_P(HEEVDJ, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEEVDJ, strided_batched__double_complex) +{ + run_tests(); +} + +// daily_lapack tests normal execution with medium to large sizes +INSTANTIATE_TEST_SUITE_P(daily_lapack, + SYEVDJ, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + HEEVDJ, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + +// checkin_lapack tests normal execution with small sizes, invalid sizes, +// quick returns, and corner cases +INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEVDJ, Combine(ValuesIn(size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEVDJ, Combine(ValuesIn(size_range), ValuesIn(op_range))); diff --git a/clients/gtest/sygvdj_hegvdj_gtest.cpp b/clients/gtest/sygvdj_hegvdj_gtest.cpp new file mode 100644 index 000000000..075c0be79 --- /dev/null +++ b/clients/gtest/sygvdj_hegvdj_gtest.cpp @@ -0,0 +1,207 @@ +/* 
************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * *************************************************************************/ + +#include "testing_sygvdj_hegvdj.hpp" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using namespace std; + +typedef std::tuple, vector> sygvdj_tuple; + +// each matrix_size_range is a {n, lda, ldb, singular} +// if singular = 1, then the used matrix for the tests is not positive definite + +// each type_range is a {itype, evect, uplo} + +// case when n = 0, itype = 1, evect = 'N', and uplo = U will also execute the bad arguments test +// (null handle, null pointers and invalid values) + +const vector> type_range + = {{'1', 'N', 'U'}, {'2', 'N', 'L'}, {'3', 'N', 'U'}, + {'1', 'V', 'L'}, {'2', 'V', 'U'}, {'3', 'V', 'L'}}; + +// for checkin_lapack tests +const vector> matrix_size_range = { + // quick return + {0, 1, 1, 0}, + // invalid + {-1, 1, 1, 0}, + {20, 5, 5, 0}, + // normal (valid) samples + {20, 30, 20, 1}, + {35, 35, 35, 0}, + {52, 52, 52, 1}, + {50, 50, 60, 1}}; + +// for daily_lapack tests +const vector> large_matrix_size_range = { + {192, 192, 192, 0}, + {256, 270, 256, 0}, + {300, 300, 310, 0}, +}; + +Arguments sygvdj_setup_arguments(sygvdj_tuple tup) +{ + vector matrix_size = std::get<0>(tup); + vector type = std::get<1>(tup); + + Arguments arg; + + arg.set("n", matrix_size[0]); + arg.set("lda", matrix_size[1]); + arg.set("ldb", matrix_size[2]); + + arg.set("itype", type[0]); + arg.set("evect", type[1]); + arg.set("uplo", type[2]); + + // only testing standard use case/defaults for strides + + arg.timing = 0; + arg.singular = matrix_size[3]; + + return arg; +} + +class SYGVDJ_HEGVDJ : public ::TestWithParam +{ +protected: + void TearDown() override + { + EXPECT_EQ(hipGetLastError(), hipSuccess); + } + + template + void run_tests() + { + Arguments arg = sygvdj_setup_arguments(GetParam()); + + if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' + && arg.peek("uplo") == 'U' && arg.peek("n") == 0) + 
testing_sygvdj_hegvdj_bad_arg(); + + arg.batch_count = (BATCHED || STRIDED ? 3 : 1); + if(arg.singular == 1) + testing_sygvdj_hegvdj(arg); + + arg.singular = 0; + testing_sygvdj_hegvdj(arg); + } +}; + +class SYGVDJ : public SYGVDJ_HEGVDJ +{ +}; + +class HEGVDJ : public SYGVDJ_HEGVDJ +{ +}; + +// non-batch tests + +TEST_P(SYGVDJ, __float) +{ + run_tests(); +} + +TEST_P(SYGVDJ, __double) +{ + run_tests(); +} + +TEST_P(HEGVDJ, __float_complex) +{ + run_tests(); +} + +TEST_P(HEGVDJ, __double_complex) +{ + run_tests(); +} + +// batched tests + +TEST_P(SYGVDJ, batched__float) +{ + run_tests(); +} + +TEST_P(SYGVDJ, batched__double) +{ + run_tests(); +} + +TEST_P(HEGVDJ, batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEGVDJ, batched__double_complex) +{ + run_tests(); +} + +// strided_batched cases + +TEST_P(SYGVDJ, strided_batched__float) +{ + run_tests(); +} + +TEST_P(SYGVDJ, strided_batched__double) +{ + run_tests(); +} + +TEST_P(HEGVDJ, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEGVDJ, strided_batched__double_complex) +{ + run_tests(); +} + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + SYGVDJ, + Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + SYGVDJ, + Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + HEGVDJ, + Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + HEGVDJ, + Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); diff --git a/clients/include/rocsolver.hpp b/clients/include/rocsolver.hpp index d66770af3..c8583158d 100644 --- a/clients/include/rocsolver.hpp +++ b/clients/include/rocsolver.hpp @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -7391,6 +7391,146 @@ inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, } /********************************************************/ +/******************** SYEVDJ/HEEVDJ ********************/ +// normal and strided_batched +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* A, + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? rocsolver_ssyevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_ssyevdj(handle, evect, uplo, n, A, lda, D, info); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* A, + rocblas_int lda, + rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? rocsolver_dsyevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_dsyevdj(handle, evect, uplo, n, A, lda, D, info); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* A, + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? 
rocsolver_cheevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_cheevdj(handle, evect, uplo, n, A, lda, D, info); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* A, + rocblas_int lda, + rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? rocsolver_zheevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_zheevdj(handle, evect, uplo, n, A, lda, D, info); +} + +// batched +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* const A[], + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_ssyevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* const A[], + rocblas_int lda, + rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_dsyevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* const A[], + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_cheevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* const A[], + rocblas_int lda, + 
rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_zheevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} +/********************************************************/ + /******************** SYEVJ/HEEVJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_syevj_heevj(bool STRIDED, @@ -8431,6 +8571,186 @@ inline rocblas_status rocsolver_sygvd_hegvd(bool STRIDED, } /********************************************************/ +/******************** SYGVDJ_HEGVDJ ********************/ +// normal and strided_batched +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* A, + rocblas_int lda, + rocblas_stride stA, + float* B, + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_ssygvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_ssygvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* A, + rocblas_int lda, + rocblas_stride stA, + double* B, + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_dsygvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_dsygvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* A, + 
rocblas_int lda, + rocblas_stride stA, + rocblas_float_complex* B, + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_chegvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_chegvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* A, + rocblas_int lda, + rocblas_stride stA, + rocblas_double_complex* B, + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_zhegvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_zhegvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +// batched +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* const A[], + rocblas_int lda, + rocblas_stride stA, + float* const B[], + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_ssygvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* const A[], + rocblas_int lda, + rocblas_stride stA, + double* const B[], + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_dsygvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} + 
+inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* const A[], + rocblas_int lda, + rocblas_stride stA, + rocblas_float_complex* const B[], + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_chegvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* const A[], + rocblas_int lda, + rocblas_stride stA, + rocblas_double_complex* const B[], + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_zhegvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} +/********************************************************/ + /******************** SYGVJ_HEGVJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, diff --git a/clients/include/rocsolver_dispatcher.hpp b/clients/include/rocsolver_dispatcher.hpp index 75de0bb91..815f74d70 100644 --- a/clients/include/rocsolver_dispatcher.hpp +++ b/clients/include/rocsolver_dispatcher.hpp @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -90,11 +90,13 @@ #include "testing_sterf.hpp" #include "testing_syev_heev.hpp" #include "testing_syevd_heevd.hpp" +#include "testing_syevdj_heevdj.hpp" #include "testing_syevj_heevj.hpp" #include "testing_syevx_heevx.hpp" #include "testing_sygsx_hegsx.hpp" #include "testing_sygv_hegv.hpp" #include "testing_sygvd_hegvd.hpp" +#include "testing_sygvdj_hegvdj.hpp" #include "testing_sygvj_hegvj.hpp" #include "testing_sygvx_hegvx.hpp" #include "testing_sytf2_sytrf.hpp" @@ -322,6 +324,10 @@ class rocsolver_dispatcher {"syevd", testing_syevd_heevd}, {"syevd_batched", testing_syevd_heevd}, {"syevd_strided_batched", testing_syevd_heevd}, + // syevdj + {"syevdj", testing_syevdj_heevdj}, + {"syevdj_batched", testing_syevdj_heevdj}, + {"syevdj_strided_batched", testing_syevdj_heevdj}, // syevj {"syevj", testing_syevj_heevj}, {"syevj_batched", testing_syevj_heevj}, @@ -338,6 +344,10 @@ class rocsolver_dispatcher {"sygvd", testing_sygvd_hegvd}, {"sygvd_batched", testing_sygvd_hegvd}, {"sygvd_strided_batched", testing_sygvd_hegvd}, + // sygvdj + {"sygvdj", testing_sygvdj_hegvdj}, + {"sygvdj_batched", testing_sygvdj_hegvdj}, + {"sygvdj_strided_batched", testing_sygvdj_hegvdj}, // sygvj {"sygvj", testing_sygvj_hegvj}, {"sygvj_batched", testing_sygvj_hegvj}, @@ -412,6 +422,10 @@ class rocsolver_dispatcher {"heevd", testing_syevd_heevd}, {"heevd_batched", testing_syevd_heevd}, {"heevd_strided_batched", testing_syevd_heevd}, + // heevdj + {"heevdj", testing_syevdj_heevdj}, + {"heevdj_batched", testing_syevdj_heevdj}, + {"heevdj_strided_batched", testing_syevdj_heevdj}, // heevj {"heevj", testing_syevj_heevj}, {"heevj_batched", testing_syevj_heevj}, @@ -428,6 +442,10 @@ class rocsolver_dispatcher {"hegvd", testing_sygvd_hegvd}, {"hegvd_batched", testing_sygvd_hegvd}, {"hegvd_strided_batched", testing_sygvd_hegvd}, + // hegvdj + {"hegvdj", 
testing_sygvdj_hegvdj}, + {"hegvdj_batched", testing_sygvdj_hegvdj}, + {"hegvdj_strided_batched", testing_sygvdj_hegvdj}, // hegvj {"hegvj", testing_sygvj_hegvj}, {"hegvj_batched", testing_sygvj_hegvj}, diff --git a/clients/include/testing_syevdj_heevdj.hpp b/clients/include/testing_syevdj_heevdj.hpp new file mode 100644 index 000000000..128f4e515 --- /dev/null +++ b/clients/include/testing_syevdj_heevdj.hpp @@ -0,0 +1,590 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ************************************************************************ */ + +#pragma once + +#include "client_util.hpp" +#include "clientcommon.hpp" +#include "lapack_host_reference.hpp" +#include "norm.hpp" +#include "rocsolver.hpp" +#include "rocsolver_arguments.hpp" +#include "rocsolver_test.hpp" + +template +void syevdj_heevdj_checkBadArgs(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + T dA, + const rocblas_int lda, + const rocblas_stride stA, + S dD, + const rocblas_stride stD, + U dinfo, + const rocblas_int bc) +{ + // handle + EXPECT_ROCBLAS_STATUS( + rocsolver_syevdj_heevdj(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc), + rocblas_status_invalid_handle); + + // values + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, rocblas_evect(0), uplo, n, dA, + lda, stA, dD, stD, dinfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, rocblas_fill_full, n, dA, + lda, stA, dD, stD, dinfo, bc), + rocblas_status_invalid_value); + + // sizes (only check batch_count if applicable) + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, + dD, stD, dinfo, -1), + rocblas_status_invalid_size); + + // pointers + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, + stA, dD, stD, dinfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, + (S) nullptr, stD, dinfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, + stD, (U) nullptr, bc), + rocblas_status_invalid_pointer); + + // quick return with invalid pointers + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, + stA, (S) nullptr, stD, dinfo, bc), + 
rocblas_status_success); + + // quick return with zero batch_count if applicable + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, + dD, stD, (U) nullptr, 0), + rocblas_status_success); +} + +template +void testing_syevdj_heevdj_bad_arg() +{ + using S = decltype(std::real(T{})); + + // safe arguments + rocblas_local_handle handle; + rocblas_evect evect = rocblas_evect_none; + rocblas_fill uplo = rocblas_fill_lower; + rocblas_int n = 1; + rocblas_int lda = 1; + rocblas_stride stA = 1; + rocblas_stride stD = 1; + rocblas_int bc = 1; + + if(BATCHED) + { + // memory allocations + device_batch_vector dA(1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dinfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dinfo.memcheck()); + + // check bad arguments + syevdj_heevdj_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), + stD, dinfo.data(), bc); + } + else + { + // memory allocations + device_strided_batch_vector dA(1, 1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dinfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dinfo.memcheck()); + + // check bad arguments + syevdj_heevdj_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), + stD, dinfo.data(), bc); + } +} + +template +void syevdj_heevdj_initData(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_int bc, + Th& hA, + std::vector& A, + bool test = true) +{ + if(CPU) + { + rocblas_init(hA, true); + + // scale A to avoid singularities + for(rocblas_int b = 0; b < bc; ++b) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + { + if(i == j) + hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; + else + hA[b][i + j * lda] -= 4; + } + } + + // 
make copy of original data to test vectors if required + if(test && evect == rocblas_evect_original) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; + } + } + } + } + + if(GPU) + { + // now copy to the GPU + CHECK_HIP_ERROR(dA.transfer_from(hA)); + } +} + +template +void syevdj_heevdj_getError(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Sd& dD, + const rocblas_stride stD, + Id& dinfo, + const rocblas_int bc, + Th& hA, + Th& hAres, + Sh& hD, + Sh& hDres, + Ih& hinfo, + Ih& hinfoRes, + double* max_err) +{ + constexpr bool COMPLEX = rocblas_is_complex; + using S = decltype(std::real(T{})); + + int lwork = (COMPLEX ? 2 * n - 1 : 0); + int lrwork = 3 * n - 1; + std::vector work(lwork); + std::vector rwork(lrwork); + std::vector A(lda * n * bc); + + // input data initialization + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A); + + // execute computations + // GPU lapack + CHECK_ROCBLAS_ERROR(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, + stA, dD.data(), stD, dinfo.data(), bc)); + + CHECK_HIP_ERROR(hDres.transfer_from(dD)); + CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); + if(evect == rocblas_evect_original) + CHECK_HIP_ERROR(hAres.transfer_from(dA)); + + // CPU lapack + for(rocblas_int b = 0; b < bc; ++b) + cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, rwork.data(), lrwork, + hinfo[b]); + + // Check info for non-convergence + *max_err = 0; + for(rocblas_int b = 0; b < bc; ++b) + { + EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; + if(hinfo[b][0] != hinfoRes[b][0]) + *max_err += 1; + } + + // (We expect the used input matrices to always converge. 
Testing + // implicitly the equivalent non-converged matrix is very complicated and it boils + // down to essentially run the algorithm again and until convergence is achieved). + + double err = 0; + + for(rocblas_int b = 0; b < bc; ++b) + { + if(evect != rocblas_evect_original) + { + // only eigenvalues needed; can compare with LAPACK + + // error is ||hD - hDRes|| / ||hD|| + // using frobenius norm + if(hinfo[b][0] == 0) + err = norm_error('F', 1, n, 1, hD[b], hDres[b]); + *max_err = err > *max_err ? err : *max_err; + } + else + { + // both eigenvalues and eigenvectors needed; need to implicitly test + // eigenvectors due to non-uniqueness of eigenvectors under scaling + if(hinfo[b][0] == 0) + { + // multiply A with each of the n eigenvectors and divide by corresponding + // eigenvalues + T alpha; + T beta = 0; + for(int j = 0; j < n; j++) + { + alpha = T(1) / hDres[b][j]; + cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, + 1, beta, hA[b] + j * lda, 1); + } + + // error is ||hA - hARes|| / ||hA|| + // using frobenius norm + err = norm_error('F', n, n, lda, hA[b], hAres[b]); + *max_err = err > *max_err ? 
err : *max_err; + } + } + } +} + +template +void syevdj_heevdj_getPerfData(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Sd& dD, + const rocblas_stride stD, + Id& dinfo, + const rocblas_int bc, + Th& hA, + Sh& hD, + Ih& hinfo, + double* gpu_time_used, + double* cpu_time_used, + const rocblas_int hot_calls, + const int profile, + const bool profile_kernels, + const bool perf) +{ + std::vector A(lda * n * bc); + + if(!perf) + { + // cpu-lapack performance (only if not in perf mode) + *cpu_time_used = nan(""); + } + + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); + + // cold calls + for(int iter = 0; iter < 2; iter++) + { + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); + + CHECK_ROCBLAS_ERROR(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, + stA, dD.data(), stD, dinfo.data(), bc)); + } + + // gpu-lapack performance + hipStream_t stream; + CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); + double start; + + if(profile > 0) + { + if(profile_kernels) + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile + | rocblas_layer_mode_ex_log_kernel); + else + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); + rocsolver_log_set_max_levels(profile); + } + + for(rocblas_int iter = 0; iter < hot_calls; iter++) + { + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); + + start = get_time_us_sync(stream); + rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), + stD, dinfo.data(), bc); + *gpu_time_used += get_time_us_sync(stream) - start; + } + *gpu_time_used /= hot_calls; +} + +template +void testing_syevdj_heevdj(Arguments& argus) +{ + using S = decltype(std::real(T{})); + + // get arguments + rocblas_local_handle handle; + char evectC = argus.get("evect"); + char uploC = argus.get("uplo"); + rocblas_int n = 
argus.get("n"); + rocblas_int lda = argus.get("lda", n); + rocblas_stride stA = argus.get("strideA", lda * n); + rocblas_stride stD = argus.get("strideD", n); + + rocblas_evect evect = char2rocblas_evect(evectC); + rocblas_fill uplo = char2rocblas_fill(uploC); + rocblas_int bc = argus.batch_count; + rocblas_int hot_calls = argus.iters; + + // check non-supported values + if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T* const*)nullptr, lda, stA, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + else + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T*)nullptr, lda, stA, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_args); + + return; + } + + // determine sizes + size_t size_A = size_t(lda) * n; + size_t size_D = n; + size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; + size_t size_Dres = (argus.unit_check || argus.norm_check) ? 
size_D : 0; + + double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; + + // check invalid sizes + bool invalid_size = (n < 0 || lda < n || bc < 0); + if(invalid_size) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T* const*)nullptr, lda, stA, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + else + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T*)nullptr, lda, stA, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_size); + + return; + } + + // memory size query is necessary + if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) + { + CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); + if(BATCHED) + CHECK_ALLOC_QUERY(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T* const*)nullptr, lda, stA, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc)); + else + CHECK_ALLOC_QUERY(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T*)nullptr, + lda, stA, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc)); + + size_t size; + CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); + if(argus.mem_query) + { + rocsolver_bench_inform(inform_mem_query, size); + return; + } + + CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); + } + + // memory allocations (all cases) + // host + host_strided_batch_vector hD(size_D, 1, stD, bc); + host_strided_batch_vector hinfo(1, 1, 1, bc); + host_strided_batch_vector hinfoRes(1, 1, 1, bc); + host_strided_batch_vector hDres(size_Dres, 1, stD, bc); + // device + device_strided_batch_vector dD(size_D, 1, stD, bc); + device_strided_batch_vector dinfo(1, 1, 1, bc); + if(size_D) + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dinfo.memcheck()); + + if(BATCHED) + { + // memory allocations + host_batch_vector hA(size_A, 1, bc); + host_batch_vector hAres(size_Ares, 1, 
bc); + device_batch_vector dA(size_A, 1, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), + lda, stA, dD.data(), stD, dinfo.data(), bc), + rocblas_status_success); + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + { + syevdj_heevdj_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, + bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); + } + + // collect performance data + if(argus.timing) + { + syevdj_heevdj_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, + dinfo, bc, hA, hD, hinfo, &gpu_time_used, + &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf); + } + } + + else + { + // memory allocations + host_strided_batch_vector hA(size_A, 1, stA, bc); + host_strided_batch_vector hAres(size_Ares, 1, stA, bc); + device_strided_batch_vector dA(size_A, 1, stA, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), + lda, stA, dD.data(), stD, dinfo.data(), bc), + rocblas_status_success); + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + { + syevdj_heevdj_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, + bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); + } + + // collect performance data + if(argus.timing) + { + syevdj_heevdj_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, + dinfo, bc, hA, hD, hinfo, &gpu_time_used, + &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf); + } + } + + // validate results for rocsolver-test + // using 2 * n * machine_precision as tolerance + 
if(argus.unit_check) + ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); + + // output results for rocsolver-bench + if(argus.timing) + { + if(!argus.perf) + { + rocsolver_bench_header("Arguments:"); + if(BATCHED) + { + rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "batch_c"); + rocsolver_bench_output(evectC, uploC, n, lda, stD, bc); + } + else if(STRIDED) + { + rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "batch_c"); + rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, bc); + } + else + { + rocsolver_bench_output("evect", "uplo", "n", "lda"); + rocsolver_bench_output(evectC, uploC, n, lda); + } + rocsolver_bench_header("Results:"); + if(argus.norm_check) + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); + rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); + } + else + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us"); + rocsolver_bench_output(cpu_time_used, gpu_time_used); + } + rocsolver_bench_endl(); + } + else + { + if(argus.norm_check) + rocsolver_bench_output(gpu_time_used, max_error); + else + rocsolver_bench_output(gpu_time_used); + } + } + + // ensure all arguments were consumed + argus.validate_consumed(); +} + +#define EXTERN_TESTING_SYEVDJ_HEEVDJ(...) \ + extern template void testing_syevdj_heevdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(EXTERN_TESTING_SYEVDJ_HEEVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/include/testing_sygvdj_hegvdj.hpp b/clients/include/testing_sygvdj_hegvdj.hpp new file mode 100644 index 000000000..6d739d690 --- /dev/null +++ b/clients/include/testing_sygvdj_hegvdj.hpp @@ -0,0 +1,726 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * *************************************************************************/ + +#pragma once + +#include "client_util.hpp" +#include "clientcommon.hpp" +#include "lapack_host_reference.hpp" +#include "norm.hpp" +#include "rocsolver.hpp" +#include "rocsolver_arguments.hpp" +#include "rocsolver_test.hpp" + +template +void sygvdj_hegvdj_checkBadArgs(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + T dA, + const rocblas_int lda, + const rocblas_stride stA, + T dB, + const rocblas_int ldb, + const rocblas_stride stB, + U dD, + const rocblas_stride stD, + rocblas_int* dInfo, + const rocblas_int bc) +{ + // handle + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, + stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_handle); + + // values + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, rocblas_eform(0), evect, uplo, n, + dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, rocblas_evect(0), uplo, n, + dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, rocblas_evect_tridiagonal, + uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, + dInfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, rocblas_fill_full, + n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_value); + + // sizes (only check batch_count if applicable) + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, + lda, stA, dB, ldb, stB, dD, stD, dInfo, -1), + rocblas_status_invalid_size); + + // pointers + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, + 
lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, + stA, (T) nullptr, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, + stA, dB, ldb, stB, (U) nullptr, stD, dInfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, + dB, ldb, stB, dD, stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_pointer); + + // quick return with invalid pointers + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, 0, + (T) nullptr, lda, stA, (T) nullptr, ldb, stB, + (U) nullptr, stD, dInfo, bc), + rocblas_status_success); + + // quick return with zero batch_count if applicable + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, + lda, stA, dB, ldb, stB, dD, stD, + (rocblas_int*)nullptr, 0), + rocblas_status_success); +} + +template +void testing_sygvdj_hegvdj_bad_arg() +{ + using S = decltype(std::real(T{})); + + // safe arguments + rocblas_local_handle handle; + rocblas_int n = 1; + rocblas_int lda = 1; + rocblas_int ldb = 1; + rocblas_stride stA = 1; + rocblas_stride stB = 1; + rocblas_stride stD = 1; + rocblas_int bc = 1; + rocblas_eform itype = rocblas_eform_ax; + rocblas_evect evect = rocblas_evect_none; + rocblas_fill uplo = rocblas_fill_upper; + + if(BATCHED) + { + // memory allocations + device_batch_vector dA(1, 1, 1); + device_batch_vector dB(1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dInfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dB.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dInfo.memcheck()); + + // check bad arguments + sygvdj_hegvdj_checkBadArgs(handle, itype, evect, uplo, 
n, dA.data(), lda, stA, + dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); + } + else + { + // memory allocations + device_strided_batch_vector dA(1, 1, 1, 1); + device_strided_batch_vector dB(1, 1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dInfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dB.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dInfo.memcheck()); + + // check bad arguments + sygvdj_hegvdj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, + dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); + } +} + +template +void sygvdj_hegvdj_initData(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Td& dB, + const rocblas_int ldb, + const rocblas_stride stB, + const rocblas_int bc, + Th& hA, + Th& hB, + host_strided_batch_vector& A, + host_strided_batch_vector& B, + const bool test, + const bool singular) +{ + if(CPU) + { + rocblas_int info; + rocblas_init(hA, true); + rocblas_init(hB, false); + + for(rocblas_int b = 0; b < bc; ++b) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + { + if(i == j) + { + hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; + hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; + } + else + { + hA[b][i + j * lda] -= 4; + } + } + } + + if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) + { + // make some matrices B not positive definite + // always the same elements for debugging purposes + // the algorithm must detect the lower order of the principal minors <= 0 + // in those matrices in the batch that are non positive definite + rocblas_int i = n / 4 + b; + i -= (i / n) * n; + hB[b][i + i * ldb] = 0; + i = n / 2 + b; + i -= (i / n) * n; + hB[b][i + i * ldb] = 0; + i = n - 1 + b; + i -= (i / n) * n; + hB[b][i + i * ldb] = 0; + } + + // store A and B for 
testing purposes + if(test && evect != rocblas_evect_none) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + { + if(itype != rocblas_eform_bax) + { + A[b][i + j * lda] = hA[b][i + j * lda]; + B[b][i + j * ldb] = hB[b][i + j * ldb]; + } + else + { + A[b][i + j * lda] = hB[b][i + j * ldb]; + B[b][i + j * ldb] = hA[b][i + j * lda]; + } + } + } + } + } + } + + if(GPU) + { + // now copy data to the GPU + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dB.transfer_from(hB)); + } +} + +template +void sygvdj_hegvdj_getError(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Td& dB, + const rocblas_int ldb, + const rocblas_stride stB, + Ud& dD, + const rocblas_stride stD, + Vd& dInfo, + const rocblas_int bc, + Th& hA, + Th& hARes, + Th& hB, + Uh& hD, + Uh& hDRes, + Vh& hInfo, + Vh& hInfoRes, + double* max_err, + const bool singular) +{ + constexpr bool COMPLEX = rocblas_is_complex; + using S = decltype(std::real(T{})); + + rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); + rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); + std::vector work(lwork); + std::vector rwork(lrwork); + host_strided_batch_vector A(lda * n, 1, lda * n, bc); + host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); + + // input data initialization + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, + hA, hB, A, B, true, singular); + + // execute computations + // GPU lapack + CHECK_ROCBLAS_ERROR(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), + lda, stA, dB.data(), ldb, stB, dD.data(), stD, + dInfo.data(), bc)); + + CHECK_HIP_ERROR(hDRes.transfer_from(dD)); + CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); + if(evect != rocblas_evect_none) + CHECK_HIP_ERROR(hARes.transfer_from(dA)); + + // CPU lapack + for(rocblas_int b = 0; b < bc; ++b) + { + cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, + rwork.data(), hInfo[b]); + } + + // (We expect the used input matrices to always converge. Testing + // implicitly the equivalent non-converged matrix is very complicated and it boils + // down to essentially run the algorithm again and until convergence is achieved. + // We do test with indefinite matrices B). + + // check info for non-convergence and/or positive-definiteness + *max_err = 0; + for(rocblas_int b = 0; b < bc; ++b) + { + EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; + if(hInfo[b][0] != hInfoRes[b][0]) + *max_err += 1; + } + + double err; + + for(rocblas_int b = 0; b < bc; ++b) + { + if(evect == rocblas_evect_none) + { + // only eigenvalues needed; can compare with LAPACK + + // error is ||hD - hDRes|| / ||hD|| + // using frobenius norm + if(hInfoRes[b][0] == 0) + { + err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); + *max_err = err > *max_err ? 
err : *max_err; + } + } + else + { + // both eigenvalues and eigenvectors needed; need to implicitly test + // eigenvectors due to non-uniqueness of eigenvectors under scaling + if(hInfoRes[b][0] == 0) + { + T alpha = 1; + T beta = 0; + + // hARes contains eigenvectors x + // compute B*x (or A*x) and store in hB + cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, + hB[b], ldb); + + if(itype == rocblas_eform_ax) + { + // problem is A*x = (lambda)*B*x + + // compute (1/lambda)*A*x and store in hA + for(int j = 0; j < n; j++) + { + alpha = T(1) / hDRes[b][j]; + cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, + hA[b] + j * lda, 1); + } + + // move B*x into hARes + for(rocblas_int i = 0; i < n; i++) + for(rocblas_int j = 0; j < n; j++) + hARes[b][i + j * lda] = hB[b][i + j * ldb]; + } + else + { + // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x + + // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA + for(int j = 0; j < n; j++) + { + alpha = T(1) / hDRes[b][j]; + cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, + hA[b] + j * lda, 1); + } + } + + // error is ||hA - hARes|| / ||hA|| + // using frobenius norm + err = norm_error('F', n, n, lda, hA[b], hARes[b]); + *max_err = err > *max_err ? 
err : *max_err; + } + } + } +} + +template +void sygvdj_hegvdj_getPerfData(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Td& dB, + const rocblas_int ldb, + const rocblas_stride stB, + Ud& dD, + const rocblas_stride stD, + Vd& dInfo, + const rocblas_int bc, + Th& hA, + Th& hB, + Uh& hD, + Vh& hInfo, + double* gpu_time_used, + double* cpu_time_used, + const rocblas_int hot_calls, + const int profile, + const bool profile_kernels, + const bool perf, + const bool singular) +{ + host_strided_batch_vector A(1, 1, 1, 1); + host_strided_batch_vector B(1, 1, 1, 1); + + if(!perf) + { + // cpu-lapack performance (only if not in perf mode) + *cpu_time_used = nan(""); + } + + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, + hA, hB, A, B, false, singular); + + // cold calls + for(int iter = 0; iter < 2; iter++) + { + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, + bc, hA, hB, A, B, false, singular); + + CHECK_ROCBLAS_ERROR(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + dA.data(), lda, stA, dB.data(), ldb, stB, + dD.data(), stD, dInfo.data(), bc)); + } + + // gpu-lapack performance + hipStream_t stream; + CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); + double start; + + if(profile > 0) + { + if(profile_kernels) + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile + | rocblas_layer_mode_ex_log_kernel); + else + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); + rocsolver_log_set_max_levels(profile); + } + + for(rocblas_int iter = 0; iter < hot_calls; iter++) + { + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, + bc, hA, hB, A, B, false, singular); + + start = get_time_us_sync(stream); + rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, + dB.data(), 
ldb, stB, dD.data(), stD, dInfo.data(), bc); + *gpu_time_used += get_time_us_sync(stream) - start; + } + *gpu_time_used /= hot_calls; +} + +template +void testing_sygvdj_hegvdj(Arguments& argus) +{ + using S = decltype(std::real(T{})); + + // get arguments + rocblas_local_handle handle; + char itypeC = argus.get("itype"); + char evectC = argus.get("evect"); + char uploC = argus.get("uplo"); + rocblas_int n = argus.get("n"); + rocblas_int lda = argus.get("lda", n); + rocblas_int ldb = argus.get("ldb", n); + rocblas_stride stA = argus.get("strideA", lda * n); + rocblas_stride stB = argus.get("strideB", ldb * n); + rocblas_stride stD = argus.get("strideD", n); + + rocblas_eform itype = char2rocblas_eform(itypeC); + rocblas_evect evect = char2rocblas_evect(evectC); + rocblas_fill uplo = char2rocblas_fill(uploC); + rocblas_int bc = argus.batch_count; + rocblas_int hot_calls = argus.iters; + + rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; + rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; + + // check non-supported values + if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T* const*)nullptr, lda, stA, + (T* const*)nullptr, ldb, stB, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + else + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T*)nullptr, lda, stA, (T*)nullptr, ldb, + stB, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_args); + + return; + } + + // determine sizes + size_t size_A = size_t(lda) * n; + size_t size_B = size_t(ldb) * n; + size_t size_D = size_t(n); + double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; + + size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; + size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; + + // check invalid sizes + bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); + if(invalid_size) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T* const*)nullptr, lda, stA, + (T* const*)nullptr, ldb, stB, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + else + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T*)nullptr, lda, stA, (T*)nullptr, ldb, + stB, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_size); + + return; + } + + // memory size query is necessary + if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) + { + CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); + if(BATCHED) + CHECK_ALLOC_QUERY(rocsolver_sygvdj_hegvdj( + STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, + (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); + else + CHECK_ALLOC_QUERY(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, + (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); + + size_t size; + CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); + if(argus.mem_query) + { + rocsolver_bench_inform(inform_mem_query, size); + return; + } + + CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); + } + + // memory allocations (all cases) + // host + host_strided_batch_vector hD(size_D, 1, stD, bc); + host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); + host_strided_batch_vector hInfo(1, 1, 1, bc); + host_strided_batch_vector hInfoRes(1, 1, 1, bc); + // device + device_strided_batch_vector dD(size_D, 1, stD, bc); + device_strided_batch_vector dInfo(1, 1, 1, bc); + if(size_D) + 
CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dInfo.memcheck()); + + if(BATCHED) + { + // memory allocations + host_batch_vector hA(size_A, 1, bc); + host_batch_vector hARes(size_ARes, 1, bc); + host_batch_vector hB(size_B, 1, bc); + device_batch_vector dA(size_A, 1, bc); + device_batch_vector dB(size_B, 1, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + if(size_B) + CHECK_HIP_ERROR(dB.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + dA.data(), lda, stA, dB.data(), ldb, stB, + dD.data(), stD, dInfo.data(), bc), + rocblas_status_success); + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + sygvdj_hegvdj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, + stB, dD, stD, dInfo, bc, hA, hARes, hB, hD, hDRes, + hInfo, hInfoRes, &max_error, argus.singular); + + // collect performance data + if(argus.timing) + sygvdj_hegvdj_getPerfData( + handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, + hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf, argus.singular); + } + + else + { + // memory allocations + host_strided_batch_vector hA(size_A, 1, stA, bc); + host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); + host_strided_batch_vector hB(size_B, 1, stB, bc); + device_strided_batch_vector dA(size_A, 1, stA, bc); + device_strided_batch_vector dB(size_B, 1, stB, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + if(size_B) + CHECK_HIP_ERROR(dB.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + dA.data(), lda, stA, dB.data(), ldb, stB, + dD.data(), stD, dInfo.data(), bc), + rocblas_status_success); + if(argus.timing) + 
rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + sygvdj_hegvdj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, + stB, dD, stD, dInfo, bc, hA, hARes, hB, hD, hDRes, + hInfo, hInfoRes, &max_error, argus.singular); + + // collect performance data + if(argus.timing) + sygvdj_hegvdj_getPerfData( + handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, + hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf, argus.singular); + } + + // validate results for rocsolver-test + // using 2 * n * machine_precision as tolerance + if(argus.unit_check) + ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); + + // output results for rocsolver-bench + if(argus.timing) + { + if(!argus.perf) + { + rocsolver_bench_header("Arguments:"); + if(BATCHED) + { + rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", + "batch_c"); + rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, bc); + } + else if(STRIDED) + { + rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", + "strideB", "strideD", "batch_c"); + rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, bc); + } + else + { + rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); + rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); + } + rocsolver_bench_header("Results:"); + if(argus.norm_check) + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); + rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); + } + else + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us"); + rocsolver_bench_output(cpu_time_used, gpu_time_used); + } + rocsolver_bench_endl(); + } + else + { + if(argus.norm_check) + rocsolver_bench_output(gpu_time_used, max_error); + else + rocsolver_bench_output(gpu_time_used); + } + } + + // ensure all arguments were consumed 
+ argus.validate_consumed(); +} + +#define EXTERN_TESTING_SYGVDJ_HEGVDJ(...) \ + extern template void testing_sygvdj_hegvdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(EXTERN_TESTING_SYGVDJ_HEGVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/docs/api/lapacklike.rst b/docs/api/lapacklike.rst index 5f4067228..a7bb18f2d 100644 --- a/docs/api/lapacklike.rst +++ b/docs/api/lapacklike.rst @@ -388,6 +388,86 @@ rocsolver_hegvj_strided_batched() .. doxygenfunction:: rocsolver_chegvj_strided_batched +.. _syevdj: + +rocsolver_syevdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsyevdj + :outline: +.. doxygenfunction:: rocsolver_ssyevdj + +rocsolver_syevdj_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsyevdj_batched + :outline: +.. doxygenfunction:: rocsolver_ssyevdj_batched + +rocsolver_syevdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsyevdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_ssyevdj_strided_batched + +.. _heevdj: + +rocsolver_heevdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zheevdj + :outline: +.. doxygenfunction:: rocsolver_cheevdj + +rocsolver_heevdj_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zheevdj_batched + :outline: +.. doxygenfunction:: rocsolver_cheevdj_batched + +rocsolver_heevdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zheevdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_cheevdj_strided_batched + +.. _sygvdj: + +rocsolver_sygvdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsygvdj + :outline: +.. doxygenfunction:: rocsolver_ssygvdj + +rocsolver_sygvdj_batched() +--------------------------------------------------- +.. 
doxygenfunction:: rocsolver_dsygvdj_batched + :outline: +.. doxygenfunction:: rocsolver_ssygvdj_batched + +rocsolver_sygvdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsygvdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_ssygvdj_strided_batched + +.. _hegvdj: + +rocsolver_hegvdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zhegvdj + :outline: +.. doxygenfunction:: rocsolver_chegvdj + +rocsolver_hegvdj_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zhegvdj_batched + :outline: +.. doxygenfunction:: rocsolver_chegvdj_batched + +rocsolver_hegvdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zhegvdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_chegvdj_strided_batched + .. _likesvds: diff --git a/docs/design/tuning.rst b/docs/design/tuning.rst index def54ed7b..ed0e17b8c 100644 --- a/docs/design/tuning.rst +++ b/docs/design/tuning.rst @@ -262,10 +262,15 @@ STEDC_MIN_DC_SIZE (As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) +STEDC_NUM_SPLIT_BLKS +--------------------- +.. doxygendefine:: STEDC_NUM_SPLIT_BLKS + +(As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) 
-syevj and heevj functions -========================== +syevj, heevj, syevdj and heevdj functions +=========================================== The Jacobi eigensolver routines SYEVJ/HEEVJ (or the corresponding batched and strided-batched routines) can be executed with a single kernel call (for small-size matrices) or with multiple kernel calls (for large-size @@ -274,15 +279,20 @@ computed cosine and sine values, and the number of iterations/sweeps is controll the matrix is partitioned into blocks, Jacobi rotations are accumulated per block (to be applied in separate kernel calls), and the number of iterations/sweeps is controlled by the CPU (requiring synchronization of the handle stream). +When running SYEVDJ/HEEVDJ (or the corresponding batched and strided-batched routines), +the computation of the eigenvectors of the associated tridiagonal matrix +can be sped up using a divide-and-conquer approach, +provided the size of the independent block is large enough. + SYEVJ_BLOCKED_SWITCH ---------------------- .. doxygendefine:: SYEVJ_BLOCKED_SWITCH (As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) -STEDC_NUM_SPLIT_BLKS ---------------------- -.. doxygendefine:: STEDC_NUM_SPLIT_BLKS +SYEVDJ_MIN_DC_SIZE +------------------- +.. doxygendefine:: SYEVDJ_MIN_DC_SIZE (As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) diff --git a/docs/userguide/intro.rst b/docs/userguide/intro.rst index 22ff74e2b..199d3dac2 100644 --- a/docs/userguide/intro.rst +++ b/docs/userguide/intro.rst @@ -210,6 +210,10 @@ LAPACK-like functions :ref:`rocsolver_sygvj `, x, x, , :ref:`rocsolver_heevj `, , , x, x :ref:`rocsolver_hegvj `, , , x, x + :ref:`rocsolver_syevdj `, x, x, , + :ref:`rocsolver_sygvdj `, x, x, , + :ref:`rocsolver_heevdj `, , , x, x + :ref:`rocsolver_hegvdj `, , , x, x .. 
csv-table:: Singular value decomposition :header: "Function", "single", "double", "single complex", "double complex" diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index f48e7821a..5429fdac4 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -12276,7 +12276,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvdj_batched(rocblas_handle handle, The leading dimension of A_l. @param[in] strideA rocblas_stride. - Stride from the start of one matrix A_l to the next one A_(j+1). + Stride from the start of one matrix A_l to the next one A_(l+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. 
@param[in] @@ -16282,206 +16282,1242 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_cheevd_batched(rocblas_handle handle, rocblas_int* info, const rocblas_int batch_count); -ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - rocblas_double_complex* const A[], - const rocblas_int lda, - double* D, - const rocblas_stride strideD, - double* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + double* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of + real symmetric matrices A_l. + + \details + The eigenvalues are returned in ascending order. The eigenvectors are computed using a + divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors + are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect. + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill. + Specifies whether the upper or lower part of the symmetric matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0. + Number of rows and columns of matrices A_l. + @param[inout] + A pointer to type. Array on the GPU (the size depends on the value of strideA). + On entry, the matrices A_l. 
On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n. + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride. + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD). + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride. + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + E pointer to type. Array on the GPU (the size depends on the value of strideE). + This array is used to work internally with the tridiagonal matrix T_l associated with A_l. + On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l + (or properly speaking, a tridiagonal matrix equivalent to T_l). The diagonal elements + of this matrix are in D_l; those that converged correspond to a subset of the + eigenvalues of A_l (not necessarily ordered). + @param[in] + strideE rocblas_stride. + Stride from the start of one vector E_l to the next one E_(l+1). + There is no restriction for the value of strideE. Normal use case is strideE >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU. + If info[l] = 0, successful exit for matrix A_l. + If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. + i elements of E_l did not converge to zero. + If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to + compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + @param[in] + batch_count rocblas_int. batch_count >= 0. + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + float* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + double* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of + Hermitian matrices A_l. + + \details + The eigenvalues are returned in ascending order. The eigenvectors are computed using a + divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors + are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect. + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill. + Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0. + Number of rows and columns of matrices A_l. + @param[inout] + A pointer to type. Array on the GPU (the size depends on the value of strideA). + On entry, the matrices A_l. 
On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n. + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride. + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD). + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride. + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + E pointer to real type. Array on the GPU (the size depends on the value of strideE). + This array is used to work internally with the tridiagonal matrix T_l associated with A_l. + On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l + (or properly speaking, a tridiagonal matrix equivalent to T_l). The diagonal elements + of this matrix are in D_l; those that converged correspond to a subset of the + eigenvalues of A_l (not necessarily ordered). + @param[in] + strideE rocblas_stride. + Stride from the start of one vector E_l to the next one E_(l+1). + There is no restriction for the value of strideE. Normal use case is strideE >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU. + If info[l] = 0, successful exit for matrix A_l. + If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. + i elements of E_l did not converge to zero. + If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to + compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + @param[in] + batch_count rocblas_int. batch_count >= 0. + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + float* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + double* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYEVDJ computes the eigenvalues and optionally the eigenvectors of a real symmetric + matrix A. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the symmetric matrix A is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. Array on the GPU of dimension lda*n.\n + On entry, the matrix A. 
On exit, the eigenvectors of A if they were computed and + the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[out] + D pointer to type. Array on the GPU of dimension n.\n + The eigenvalues of A in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* D, + rocblas_int* info); +//! @} + +/*! @{ + \brief HEEVDJ computes the eigenvalues and optionally the eigenvectors of a complex Hermitian + matrix A. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the Hermitian matrix A is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. 
Array on the GPU of dimension lda*n.\n + On entry, the matrix A. On exit, the eigenvectors of A if they were computed and + the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[out] + D pointer to real type. Array on the GPU of dimension n.\n + The eigenvalues of A in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + double* D, + rocblas_int* info); +//! @} + +/*! @{ + \brief SYEVDJ_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of real symmetric matrices A_l. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the symmetric matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. 
+ @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEEVDJ_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of complex Hermitian matrices A_l. 
+ + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYEVDJ_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of real symmetric matrices A_l. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the symmetric matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Pointer to type. Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. 
lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride.\n + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEEVDJ_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of complex Hermitian matrices A_l. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. 
+ The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Pointer to type. Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride.\n + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYGVDJ computes the eigenvalues and (optionally) eigenvectors of + a real generalized symmetric-definite eigenproblem. + + \details + The problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A X = \lambda B X & \: \text{1st form,}\\ + A B X = \lambda X & \: \text{2nd form, or}\\ + B A X = \lambda X & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^T B Z=I & \: \text{if 1st or 2nd form, or}\\ + Z^T B^{-1} Z=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblem. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. 
+ rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A and B are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A and B + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. Array on the GPU of dimension lda*n.\n + On entry, the matrix A. On exit, the normalized matrix Z of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[inout] + B pointer to type. Array on the GPU of dimension ldb*n.\n + On entry, the symmetric positive definite matrix B. On exit, + the triangular factor of B as returned by \ref rocsolver_spotrf "POTRF". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrix B. + @param[out] + D pointer to type. Array on the GPU of dimension n.\n + The eigenvalues in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + If info = n + i, the leading minor of order i of B is not positive definite. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* B, + const rocblas_int ldb, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* B, + const rocblas_int ldb, + double* D, + rocblas_int* info); +//! 
@} + +/*! @{ + \brief HEGVDJ computes the eigenvalues and (optionally) eigenvectors of + a complex generalized Hermitian-definite eigenproblem. + + \details + The problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A X = \lambda B X & \: \text{1st form,}\\ + A B X = \lambda X & \: \text{2nd form, or}\\ + B A X = \lambda X & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^H B Z=I & \: \text{if 1st or 2nd form, or}\\ + Z^H B^{-1} Z=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblem. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A and B are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A and B + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. Array on the GPU of dimension lda*n.\n + On entry, the matrix A. On exit, the normalized matrix Z of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[inout] + B pointer to type. Array on the GPU of dimension ldb*n.\n + On entry, the Hermitian positive definite matrix B. 
On exit, + the triangular factor of B as returned by \ref rocsolver_spotrf "POTRF". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrix B. + @param[out] + D pointer to real type. Array on the GPU of dimension n.\n + The eigenvalues in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + If info = n + i, the leading minor of order i of B is not positive definite. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_chegvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + rocblas_float_complex* B, + const rocblas_int ldb, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zhegvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + rocblas_double_complex* B, + const rocblas_int ldb, + double* D, + rocblas_int* info); +//! @} + +/*! @{ + \brief SYGVDJ_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of real generalized symmetric-definite eigenproblems. + + \details + For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. 
+ + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^T_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^T_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. + @param[inout] + A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[inout] + B array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*n.\n + On entry, the symmetric positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_batched "POTRF_BATCHED". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. 
+ @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEGVDJ_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of complex generalized Hermitian-definite eigenproblems. + + \details + For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. 
+ + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^H_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^H_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. + @param[inout] + A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[inout] + B array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*n.\n + On entry, the Hermitian positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_batched "POTRF_BATCHED". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. 
+ @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_chegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + rocblas_float_complex* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zhegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + rocblas_double_complex* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); //! @} /*! @{ - \brief SYEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of - real symmetric matrices A_l. + \brief SYGVDJ_STRIDED_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of real generalized symmetric-definite eigenproblems. \details - The eigenvalues are returned in ascending order. The eigenvectors are computed using a - divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors - are orthonormal. 
+ For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^T_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^T_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] @param[in] handle rocblas_handle. @param[in] - evect #rocblas_evect. + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n Specifies whether the eigenvectors are to be computed. If evect is rocblas_evect_original, then the eigenvectors are computed. rocblas_evect_tridiagonal is not supported. @param[in] - uplo rocblas_fill. - Specifies whether the upper or lower part of the symmetric matrices A_l is stored. - If uplo indicates lower (or upper), then the upper (or lower) part of A_l - is not used. + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. @param[in] - n rocblas_int. n >= 0. - Number of rows and columns of matrices A_l. + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. @param[inout] - A pointer to type. Array on the GPU (the size depends on the value of strideA). - On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and - the algorithm converged; otherwise the contents of A_l are destroyed. + A pointer to type. 
Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. @param[in] - lda rocblas_int. lda >= n. + lda rocblas_int. lda >= n.\n Specifies the leading dimension of matrices A_l. @param[in] - strideA rocblas_stride. + strideA rocblas_stride.\n Stride from the start of one matrix A_l to the next one A_(l+1). - There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. - @param[out] - D pointer to type. Array on the GPU (the size depends on the value of strideD). - The eigenvalues of A_l in increasing order. + There is no restriction for the value of strideA. Normal use is strideA >= lda*n. + @param[inout] + B pointer to type. Array on the GPU (the size depends on the value of strideB).\n + On entry, the symmetric positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_strided_batched "POTRF_STRIDED_BATCHED". @param[in] - strideD rocblas_stride. - Stride from the start of one vector D_l to the next one D_(l+1). - There is no restriction for the value of strideD. Normal use case is strideD >= n. + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[in] + strideB rocblas_stride.\n + Stride from the start of one matrix B_l to the next one B_(l+1). + There is no restriction for the value of strideB. Normal use is strideB >= ldb*n. @param[out] - E pointer to type. Array on the GPU (the size depends on the value of strideE). - This array is used to work internally with the tridiagonal matrix T_l associated with A_l. - On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l - (or properly speaking, a tridiagonal matrix equivalent to T_l). 
The diagonal elements - of this matrix are in D_l; those that converged correspond to a subset of the - eigenvalues of A_l (not necessarily ordered). + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. @param[in] - strideE rocblas_stride. - Stride from the start of one vector E_l to the next one E_(l+1). - There is no restriction for the value of strideE. Normal use case is strideE >= n. + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. @param[out] - info pointer to rocblas_int. Array of batch_count integers on the GPU. - If info[l] = 0, successful exit for matrix A_l. - If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. - i elements of E_l did not converge to zero. - If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to - compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. @param[in] - batch_count rocblas_int. batch_count >= 0. - Number of matrices in the batch. + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. 
**************************************************************************************/ -ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - float* A, - const rocblas_int lda, - const rocblas_stride strideA, - float* D, - const rocblas_stride strideD, - float* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_ssygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); -ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - double* A, - const rocblas_int lda, - const rocblas_stride strideA, - double* D, - const rocblas_stride strideD, - double* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_dsygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); //! @} /*! @{ - \brief HEEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of - Hermitian matrices A_l. 
+ \brief HEGVDJ_STRIDED_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of complex generalized Hermitian-definite eigenproblems. \details - The eigenvalues are returned in ascending order. The eigenvectors are computed using a - divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors - are orthonormal. + For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^H_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^H_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] @param[in] handle rocblas_handle. @param[in] - evect #rocblas_evect. + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n Specifies whether the eigenvectors are to be computed. If evect is rocblas_evect_original, then the eigenvectors are computed. rocblas_evect_tridiagonal is not supported. @param[in] - uplo rocblas_fill. - Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. - If uplo indicates lower (or upper), then the upper (or lower) part of A_l - is not used. + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. @param[in] - n rocblas_int. n >= 0. - Number of rows and columns of matrices A_l. 
+ n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. @param[inout] - A pointer to type. Array on the GPU (the size depends on the value of strideA). - On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and - the algorithm converged; otherwise the contents of A_l are destroyed. + A pointer to type. Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. @param[in] - lda rocblas_int. lda >= n. + lda rocblas_int. lda >= n.\n Specifies the leading dimension of matrices A_l. @param[in] - strideA rocblas_stride. + strideA rocblas_stride.\n Stride from the start of one matrix A_l to the next one A_(l+1). - There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. - @param[out] - D pointer to real type. Array on the GPU (the size depends on the value of strideD). - The eigenvalues of A_l in increasing order. + There is no restriction for the value of strideA. Normal use is strideA >= lda*n. + @param[inout] + B pointer to type. Array on the GPU (the size depends on the value of strideB).\n + On entry, the Hermitian positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_strided_batched "POTRF_STRIDED_BATCHED". @param[in] - strideD rocblas_stride. - Stride from the start of one vector D_l to the next one D_(l+1). - There is no restriction for the value of strideD. Normal use case is strideD >= n. + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[in] + strideB rocblas_stride.\n + Stride from the start of one matrix B_l to the next one B_(l+1). + There is no restriction for the value of strideB. Normal use is strideB >= ldb*n. @param[out] - E pointer to real type. Array on the GPU (the size depends on the value of strideE). 
- This array is used to work internally with the tridiagonal matrix T_l associated with A_l. - On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l - (or properly speaking, a tridiagonal matrix equivalent to T_l). The diagonal elements - of this matrix are in D_l; those that converged correspond to a subset of the - eigenvalues of A_l (not necessarily ordered). + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. @param[in] - strideE rocblas_stride. - Stride from the start of one vector E_l to the next one E_(l+1). - There is no restriction for the value of strideE. Normal use case is strideE >= n. + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. @param[out] - info pointer to rocblas_int. Array of batch_count integers on the GPU. - If info[l] = 0, successful exit for matrix A_l. - If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. - i elements of E_l did not converge to zero. - If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to - compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. @param[in] - batch_count rocblas_int. batch_count >= 0. - Number of matrices in the batch. + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. 
**************************************************************************************/ -ROCSOLVER_EXPORT rocblas_status rocsolver_cheevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - rocblas_float_complex* A, - const rocblas_int lda, - const rocblas_stride strideA, - float* D, - const rocblas_stride strideD, - float* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_chegvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_float_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); -ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - rocblas_double_complex* A, - const rocblas_int lda, - const rocblas_stride strideA, - double* D, - const rocblas_stride strideD, - double* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_zhegvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_double_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); //! @} /*! @{ @@ -24116,9 +25152,9 @@ ROCSOLVER_EXPORT rocblas_status */ /*! 
\brief CREATE_RFINFO initializes the structure rfinfo that contains the meta data and descriptors of the involved matrices - required by the re-factorization functions + required by the re-factorization functions \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and - by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". + by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @param[out] @@ -24132,7 +25168,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_create_rfinfo(rocsolver_rfinfo* rfinfo rocblas_handle handle); /*! \brief DESTROY_RFINFO destroys the structure rfinfo used by the re-factorization functions - \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and + \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @@ -24145,7 +25181,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_destroy_rfinfo(rocsolver_rfinfo rfinfo /*! \brief SET_RFINFO_MODE sets the mode of the structure rfinfo required by the re-factorization functions \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and - by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". + by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @param[in] @@ -24153,7 +25189,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_destroy_rfinfo(rocsolver_rfinfo rfinfo The rfinfo struct to be set up. @param[in] mode #rocsolver_rfinfo_mode. - Use rocsolver_rfinfo_mode_cholesky when the Cholesky factorization is required. + Use rocsolver_rfinfo_mode_cholesky when the Cholesky factorization is required. 
********************************************************************/ ROCSOLVER_EXPORT rocblas_status rocsolver_set_rfinfo_mode(rocsolver_rfinfo rfinfo, @@ -24161,7 +25197,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_set_rfinfo_mode(rocsolver_rfinfo rfinf /*! \brief GET_RFINFO_MODE gets the mode of the structure rfinfo required by the re-factorization functions \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and - by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". + by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @param[in] @@ -24169,7 +25205,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_set_rfinfo_mode(rocsolver_rfinfo rfinf The referenced rfinfo struct. @param[out] mode #rocsolver_rfinfo_mode. - The queried mode. + The queried mode. ********************************************************************/ ROCSOLVER_EXPORT rocblas_status rocsolver_get_rfinfo_mode(rocsolver_rfinfo rfinfo, @@ -24344,7 +25380,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, /*! @{ \brief CSRRF_ANALYSIS performs the analysis phase required by the re-factorization functions - \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and + \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". 
\details Consider a sparse matrix \f$M\f$ previously factorized as @@ -24353,20 +25389,20 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, Q^TMQ = L_ML_M^T \f] - (Cholesky factorization for the symmetric positive definite case), or + (Cholesky factorization for the symmetric positive definite case), or - \f[ + \f[ PMQ = L_MU_M \f] - (LU factorization for the general case) + (LU factorization for the general case) where \f$L_M\f$ is lower triangular (with unit diagonal in the general case), \f$U_M\f$ is upper triangular, and \f$P\f$ and \f$Q\f$ are permutation matrices associated with pivoting and re-ordering (to minimize fill-in), respectively. The meta data generated by this routine is collected in the output parameter rfinfo. This information will allow the fast re-factorization of another sparse matrix \f$A\f$ as - \f[ + \f[ Q^TAQ = L_AL_A^T, \quad \text{or} \f] @@ -24382,17 +25418,17 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, as long as \f$A\f$ has the same sparsity pattern as the previous matrix \f$M\f$. - This function supposes that the rfinfo struct has been initialized by \ref rocsolver_create_rfinfo "RFINFO_CREATE". - By default, rfinfo is set up to work with the LU factorization (general matrices). If the matrix is symmetric positive definite, - and the Cholesky factorization is - desired, then the corresponding mode must be manually set up by \ref rocsolver_set_rfinfo_mode "SET_RFINFO_MODE". This function - does not automatically detect symmetry. + This function supposes that the rfinfo struct has been initialized by \ref rocsolver_create_rfinfo "RFINFO_CREATE". + By default, rfinfo is set up to work with the LU factorization (general matrices). If the matrix is symmetric positive definite, + and the Cholesky factorization is + desired, then the corresponding mode must be manually set up by \ref rocsolver_set_rfinfo_mode "SET_RFINFO_MODE". 
This function + does not automatically detect symmetry. For the LU factorization mode, the LU factors \f$L_M\f$ and \f$U_M\f$ must be passed in a bundle matrix \f$T=(L_M-I)+U_M\f$ as returned by \ref rocsolver_scsrrf_sumlu "CSRRF_SUMLU". For the Cholesky mode, the lower triangular part of \f$T\f$ must contain the Cholesky factor \f$L_M\f$; the strictly upper triangular - part of \f$T\f$ will be ignored. Similarly, the strictly upper triangular part of \f$M\f$ is ignored when working - in Cholesky mode. + part of \f$T\f$ will be ignored. Similarly, the strictly upper triangular part of \f$M\f$ is ignored when working + in Cholesky mode. \note If only a re-factorization will be executed (i.e. no solver phase), then nrhs can be set to zero @@ -24421,7 +25457,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, @param[in] valM pointer to type. Array on the GPU of dimension nnzM. The values of the non-zero elements of M. The strictly upper triangular entries are - not referenced when working in Cholesky mode. + not referenced when working in Cholesky mode. @param[in] nnzT rocblas_int. nnzT >= 0. The number of non-zero elements in T. @@ -24441,7 +25477,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, pivP pointer to rocblas_int. Array on the GPU of dimension n. Contains the pivot indices representing the permutation matrix P, i.e. the order in which the rows of matrix M were re-arranged. When working in Cholesky mode, - this array is not referenced and can be null. + this array is not referenced and can be null. @param[in] pivQ pointer to rocblas_int. Array on the GPU of dimension n. Contains the pivot indices representing the permutation matrix Q, i.e. 
the @@ -24515,8 +25551,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_analysis(rocblas_handle handle, This function supposes that rfinfo has been updated, by function \ref rocsolver_scsrrf_analysis "CSRRF_ANALYSIS", after the analysis phase of the previous matrix M and its initial factorization. Both functions, CSRRF_ANALYSIS and - CSRRF_REFACTLU must be run with the same rfinfo mode (LU factorization, the default mode), otherwise the workflow will - result in an error. + CSRRF_REFACTLU must be run with the same rfinfo mode (LU factorization, the default mode), otherwise the workflow will + result in an error. @param[in] handle rocblas_handle. @@ -24595,7 +25631,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_refactlu(rocblas_handle handle, /*! @{ \brief CSRRF_REFACTCHOL performs a fast Cholesky factorization of a sparse symmetric positive definite matrix \f$A\f$ - based on the information from the factorization of a previous matrix \f$M\f$ with the same sparsity pattern + based on the information from the factorization of a previous matrix \f$M\f$ with the same sparsity pattern (re-factorization). \details Consider a sparse matrix \f$M\f$ previously factorized as @@ -24652,7 +25688,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_refactlu(rocblas_handle handle, @param[out] valT pointer to type. Array on the GPU of dimension nnzT. The values of the non-zero elements of the new Cholesky factor L_A. - The strictly upper triangular entries of this array are not referenced. + The strictly upper triangular entries of this array are not referenced. @param[in] pivQ pointer to rocblas_int. Array on the GPU of dimension n. Contains the pivot indices representing the permutation matrix Q, i.e. 
the @@ -24705,23 +25741,23 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_refactchol(rocblas_handle handl Q^TAQ = L_AL_A^T \f] - (Cholesky factorization for the symmetric positive definite case), or + (Cholesky factorization for the symmetric positive definite case), or - \f[ + \f[ PAQ = L_AU_A \f] - (LU factorization for the general case), + (LU factorization for the general case), and \f$B\f$ is a dense matrix of right hand sides. - This function supposes that rfinfo has been updated by function \ref rocsolver_scsrrf_analysis "CSRRF_ANALYSIS", + This function supposes that rfinfo has been updated by function \ref rocsolver_scsrrf_analysis "CSRRF_ANALYSIS", after the analysis phase. Both functions, CSRRF_ANALYSIS and CSRRF_SOLVE must be run with the same rfinfo mode (LU or Cholesky factorization), otherwise the workflow will result in an error. For the LU factorization mode, the LU factors \f$L_A\f$ and \f$U_A\f$ must be passed in a bundle matrix \f$T=(L_A-I)+U_A\f$ - as returned by \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" or \ref rocsolver_scsrrf_sumlu "CSRRF_SUMLU". For the Cholesky mode, + as returned by \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" or \ref rocsolver_scsrrf_sumlu "CSRRF_SUMLU". For the Cholesky mode, the lower triangular part of \f$T\f$ must contain the Cholesky factor \f$L_A\f$; the strictly upper triangular part of \f$T\f$ will be ignored. diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index dd7fc560a..1b3998cba 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -1,5 +1,5 @@ # ########################################################################## -# Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -172,6 +172,12 @@ set(rocsolver_lapack_source lapack/roclapack_sygvx_hegvx_batched.cpp lapack/roclapack_sygvx_hegvx_strided_batched.cpp lapack/roclapack_sygvdx_hegvdx_inplace.cpp + lapack/roclapack_syevdj_heevdj.cpp + lapack/roclapack_syevdj_heevdj_batched.cpp + lapack/roclapack_syevdj_heevdj_strided_batched.cpp + lapack/roclapack_sygvdj_hegvdj.cpp + lapack/roclapack_sygvdj_hegvdj_batched.cpp + lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp ) set(rocsolver_auxiliary_source diff --git a/library/src/auxiliary/rocauxiliary_stedc.hpp b/library/src/auxiliary/rocauxiliary_stedc.hpp index a4e35e153..2cc595f6c 100644 --- a/library/src/auxiliary/rocauxiliary_stedc.hpp +++ b/library/src/auxiliary/rocauxiliary_stedc.hpp @@ -4,7 +4,7 @@ * Univ. of Tennessee, Univ. of California Berkeley, * Univ. of Colorado Denver and NAG Ltd.. * December 2016 - * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,12 +39,17 @@ #include "rocblas.hpp" #include "rocsolver/rocsolver.h" -#define STEDC_BDIM 512 // Number of threads per thread-block used in main stedc kernel +#define STEDC_BDIM 512 // Number of threads per thread-block used in main stedc kernels #define MAXITERS 50 // Max number of iterations for root finding method +#define MAXSWEEPS 20 // Max number of sweeps for Jacobi solver (when used) #define CLASSICQR 1 // used to specify classic QR iteration solver (default case) #define JACOBI 2 // used to specify Jacobi iteration solver #define BISECTION 3 // used to specify bisection and inverse iteration solver +/***************** Device auxiliary functions *****************************************/ +/**************************************************************************************/ + +//--------------------------------------------------------------------------------------// /** SEQ_EVAL evaluates the secular equation at a given point. It accumulates the corrections to the elements in D so that distance to poles are computed accurately **/ template @@ -143,6 +148,7 @@ __device__ void seq_eval(const rocblas_int type, *pt_er = er; } +//--------------------------------------------------------------------------------------// /** SEQ_SOLVE solves secular equation at point k (i.e. computes kth eigenvalue that is within an internal interval). We use rational interpolation and fixed weights method between the 2 poles of the interval. @@ -405,6 +411,7 @@ __device__ rocblas_int seq_solve(const rocblas_int dd, return converged ? 0 : 1; } +//--------------------------------------------------------------------------------------// /** SEQ_SOLVE_EXT solves secular equation at point n (i.e. computes last eigenvalue). We use rational interpolation and fixed weights method between the (n-1)th and nth poles. 
(TODO: In the future, we could consider using 3 poles for those cases that may need it @@ -597,28 +604,76 @@ __device__ rocblas_int seq_solve_ext(const rocblas_int dd, return converged ? 0 : 1; } +//--------------------------------------------------------------------------------------// /** STEDC_NUM_LEVELS returns the ideal number of times/levels in which a matrix (or split block) will be divided during the divide phase of divide & conquer algorithm. i.e. number of sub-blocks = 2^levels **/ __host__ __device__ inline rocblas_int stedc_num_levels(const rocblas_int n, int solver_mode) { rocblas_int levels = 0; + // return the max number of levels such that the sub-blocks are at least of size 1 + // (i.e. 2^levels <= n), and there are no more than 256 sub-blocks (i.e. 2^levels <= 256) + if(n <= 2) + return levels; switch(solver_mode) { case BISECTION: + { // TODO break; + } - case JACOBI: - // TODO + case JACOBI: // TODO: tuning will be required; using the same tuning as QR for now + { + if(n <= 4) + { + levels = 1; + } + else if(n <= 32) + { + levels = 2; + } + else if(n <= 232) + { + levels = 4; + } + else + { + if(n <= 1946) + { + if(n <= 1692) + { + if(n <= 295) + { + levels = 5; + } + else + { + levels = 7; + } + } + else + { + levels = 7; + } + } + else + { + levels = 8; + } + } break; + } case CLASSICQR: default: - // return the max number of levels such that the sub-blocks are at least of size 8, and - // there are no more than 256 sub-blocks - if(n <= 32) + { + if(n <= 4) + { + levels = 1; + } + else if(n <= 32) { levels = 2; } @@ -652,13 +707,45 @@ __host__ __device__ inline rocblas_int stedc_num_levels(const rocblas_int n, int } } } + } //end switch return levels; } -/** STEDC_SPLIT finds independent blocks in the tridiagonal matrix - given by D and E. 
(The independent blocks can then be solved in - parallel by the DC algorithm) **/ +//--------------------------------------------------------------------------------------// +/** DE2TRIDIAG generates a tridiagonal matrix from vectors of diagonal entries (D) and + off-diagonal entries (E). **/ +template +__device__ inline void de2tridiag(const int numt, + const rocblas_int id, + const rocblas_int n, + S* D, + S* E, + S* C, + const rocblas_int ldc) +{ + for(rocblas_int k = id; k < n * n; k += numt) + { + rocblas_int i = k % n; + rocblas_int j = k / n; + S val; + bool offd = (i == j + 1); + if(offd || i == j - 1) + val = offd ? E[j] : E[i]; + else + val = (i == j) ? D[i] : 0; + C[i + j * ldc] = val; + } +} + +/*************** Main kernels *********************************************************/ +/**************************************************************************************/ + +//--------------------------------------------------------------------------------------// +/** STEDC_SPLIT finds independent blocks (split-blocks) in the tridiagonal matrix + given by D and E. The independent blocks can then be solved in + parallel by the DC algorithm. + - Call this kernel with batch_count single-threaded groups in x **/ template ROCSOLVER_KERNEL void stedc_split(const rocblas_int n, S* DD, @@ -673,7 +760,7 @@ ROCSOLVER_KERNEL void stedc_split(const rocblas_int n, // select batch instance S* D = DD + (bid * strideD); S* E = EE + (bid * strideE); - rocblas_int* splits = splitsA + bid * (3 * n + 2); + rocblas_int* splits = splitsA + bid * (5 * n + 2); rocblas_int k = 0; //position where the last block starts S tol; //tolerance. If an element of E is <= tol we have an independent block @@ -704,28 +791,330 @@ ROCSOLVER_KERNEL void stedc_split(const rocblas_int n, splits[n + 1] = nb; //also save the number of split blocks } -/** STEDC_KERNEL implements the main loop of the DC algorithm to compute the - eigenvalues/eigenvectors of the symmetric tridiagonal submatrices. 
**/ +//--------------------------------------------------------------------------------------// +/** STEDC_DIVIDE_KERNEL implements the divide phase of the DC algorithm. It divides each + split-block into a number of sub-blocks. + - Call this kernel with batch_count groups in x. Groups are of size STEDC_BDIM. + - If there are actually more split-blocks than STEDC_BDIM, some threads will work with more + than one split-block sequentially. **/ +template +ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_divide_kernel(const rocblas_int n, + S* DD, + const rocblas_stride strideD, + S* EE, + const rocblas_stride strideE, + rocblas_int* splitsA, + const int solver_mode) +{ + // threads and groups indices + /* --------------------------------------------------- */ + // batch instance id + rocblas_int bid = hipBlockIdx_x; + // split-block id + rocblas_int sid = hipThreadIdx_x; + /* --------------------------------------------------- */ + + // select batch instance to work with + /* --------------------------------------------------- */ + S* D = DD + bid * strideD; + S* E = EE + bid * strideE; + /* --------------------------------------------------- */ + + // temporary arrays in global memory + /* --------------------------------------------------- */ + // contains the beginning of split blocks + rocblas_int* splits = splitsA + bid * (5 * n + 2); + // the sub-blocks sizes + rocblas_int* nsA = splits + n + 2; + // the sub-blocks initial positions + rocblas_int* psA = nsA + n; + /* --------------------------------------------------- */ + + // local variables + /* --------------------------------------------------- */ + // total number of split blocks + rocblas_int nb = splits[n + 1]; + // size of split block + rocblas_int bs; + // beginning of split block + rocblas_int p1; + // beginning of sub-block + rocblas_int p2; + // number of sub-blocks + rocblas_int blks; + // number of level of division + rocblas_int levs; + // other aux variables + S p; + rocblas_int *ns, 
*ps;
+    /* --------------------------------------------------- */
+
+    // work with STEDC_BDIM split blocks in parallel
+    /* --------------------------------------------------- */
+    for(int kb = sid; kb < nb; kb += STEDC_BDIM)
+    {
+        // Select current split block
+        p1 = splits[kb];
+        p2 = splits[kb + 1];
+        bs = p2 - p1;
+        ns = nsA + p1;
+        ps = psA + p1;
+
+        // determine ideal number of sub-blocks in split-block
+        levs = stedc_num_levels(bs, solver_mode);
+        blks = 1 << levs;
+
+        // 1. DIVIDE PHASE
+        /* ----------------------------------------------------------------- */
+        // (artificially divide split-block into blks sub-blocks
+        // find initial positions of each sub-blocks)
+
+        // find sizes of sub-blocks
+        ns[0] = bs;
+        rocblas_int t, t2;
+        for(int i = 0; i < levs; ++i)
+        {
+            for(int j = (1 << i); j > 0; --j)
+            {
+                t = ns[j - 1];
+                t2 = t / 2;
+                ns[j * 2 - 1] = (2 * t2 < t) ? t2 + 1 : t2;
+                ns[j * 2 - 2] = t2;
+            }
+        }
+
+        // find beginning of sub-blocks and update D elements
+        p2 = p1;
+        ps[0] = p2;
+        for(int i = 1; i < blks; ++i)
+        {
+            p2 += ns[i - 1];
+            ps[i] = p2;
+
+            // perform sub-block division
+            p = E[p2 - 1];
+            D[p2] -= p;
+            D[p2 - 1] -= p;
+        }
+    }
+}
+
+//--------------------------------------------------------------------------------------//
+/** STEDC_SOLVE_KERNEL implements the solver phase of the DC algorithm to
+    compute the eigenvalues/eigenvectors of the different sub-blocks of each split-block.
+    A matrix in the batch could have many split-blocks, and each split-block could be
+    divided in a maximum of nn sub-blocks.
+    - Call this kernel with batch_count groups in z, STEDC_NUM_SPLIT_BLKS groups in y
+    and nn groups in x. Groups are size STEDC_BDIM.
+    - STEDC_NUM_SPLIT_BLKS is fixed (is the number of split-blocks that will be analysed
+    in parallel). If there are actually more split-blocks, some groups will work with more
+    than one split-block sequentially.
+    - An upper bound for the number of sub-blocks (nn) can be estimated from the size n.
+    If a group has an id larger than the actual number of sub-blocks in a split-block,
+    it will do nothing. **/
+template 
+ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_solve_kernel(const rocblas_int n,
+                                                                       S* DD,
+                                                                       const rocblas_stride strideD,
+                                                                       S* EE,
+                                                                       const rocblas_stride strideE,
+                                                                       S* CC,
+                                                                       const rocblas_int shiftC,
+                                                                       const rocblas_int ldc,
+                                                                       const rocblas_stride strideC,
+                                                                       rocblas_int* iinfo,
+                                                                       S* WA,
+                                                                       rocblas_int* splitsA,
+                                                                       const S eps,
+                                                                       const S ssfmin,
+                                                                       const S ssfmax,
+                                                                       const int solver_mode)
+{
+    // threads and groups indices
+    /* --------------------------------------------------- */
+    // batch instance id
+    rocblas_int bid = hipBlockIdx_z;
+    // split-block id
+    rocblas_int sid = hipBlockIdx_y;
+    // sub-block id
+    rocblas_int tid = hipBlockIdx_x;
+    // thread index
+    rocblas_int tidb = hipThreadIdx_x;
+    /* --------------------------------------------------- */
+
+    // select batch instance to work with
+    /* --------------------------------------------------- */
+    S* C;
+    if(CC)
+        C = load_ptr_batch(CC, bid, shiftC, strideC);
+    S* D = DD + bid * strideD;
+    S* E = EE + bid * strideE;
+    rocblas_int* info = iinfo + bid;
+    /* --------------------------------------------------- */
+
+    // temporary arrays in global memory
+    /* --------------------------------------------------- */
+    // contains the beginning of split blocks
+    rocblas_int* splits = splitsA + bid * (5 * n + 2);
+    // the sub-blocks sizes
+    rocblas_int* nsA = splits + n + 2;
+    // the sub-blocks initial positions
+    rocblas_int* psA = nsA + n;
+    // workspace for solvers
+    S* W;
+    switch(solver_mode)
+    {
+    case BISECTION: break;
+
+    case JACOBI: W = WA + bid * (2 + n * n); break;
+
+    case CLASSICQR:
+    default: W = WA + bid * (2 * n);
+    }
+    /* --------------------------------------------------- */
+
+    // temporary arrays in shared memory
+    /* --------------------------------------------------- */
+    extern __shared__ rocblas_int lsmem[];
+    // shared mem for jacobi solver if needed
+    S* sj1;
+ rocblas_int* sj2; + if(solver_mode == JACOBI) + { + sj2 = lsmem; + sj1 = reinterpret_cast(sj2 + n + n % 2); + } + /* --------------------------------------------------- */ + + // local variables + /* --------------------------------------------------- */ + // total number of split blocks + rocblas_int nb = splits[n + 1]; + // size of split block + rocblas_int bs; + // size of sub block + rocblas_int sbs; + // beginning of split block + rocblas_int p1; + // beginning of sub-block + rocblas_int p2; + // number of sub-blocks + rocblas_int blks; + // number of level of division + rocblas_int levs; + // other aux variables + S p; + rocblas_int *ns, *ps; + /* --------------------------------------------------- */ + + // work with STEDC_NUM_SPLIT_BLKS split blocks in parallel + /* --------------------------------------------------- */ + for(int kb = sid; kb < nb; kb += STEDC_NUM_SPLIT_BLKS) + { + // Select current split block + p1 = splits[kb]; + p2 = splits[kb + 1]; + bs = p2 - p1; + ns = nsA + p1; + ps = psA + p1; + + // determine ideal number of sub-blocks + levs = stedc_num_levels(bs, solver_mode); + blks = 1 << levs; + + // 2. SOLVE PHASE + /* ----------------------------------------------------------------- */ + // Solve the blks sub-blocks in parallel. 
+
+        if(tid < blks)
+        {
+            sbs = ns[tid];
+            p2 = ps[tid];
+
+            switch(solver_mode)
+            {
+            case BISECTION:
+            {
+                // TODO
+                break;
+            }
+
+            case JACOBI:
+            {
+                // transform D and E into full upper tridiag matrix and copy to C
+                de2tridiag(STEDC_BDIM, tidb, sbs, D + p2, E + p2, C + p2 + p2 * ldc, ldc);
+
+                // set work space
+                S* W_Acpy = W;
+                S* W_residual = W_Acpy + n * n;
+                rocblas_int* W_n_sweeps = reinterpret_cast(W_residual + 1);
+
+                // set shared mem
+                rocblas_int even_n = sbs + sbs % 2;
+                rocblas_int half_n = even_n / 2;
+                S* cosines_res = sj1;
+                S* sines_diag = cosines_res + half_n;
+                rocblas_int* top = sj2;
+                rocblas_int* bottom = top + half_n;
+
+                // re-arrange threads in 2D array
+                rocblas_int ddx, ddy;
+                syevj_get_dims(sbs, STEDC_BDIM, &ddx, &ddy);
+                rocblas_int tix = tidb % ddx;
+                rocblas_int tiy = tidb / ddx;
+                __syncthreads();
+
+                // solve
+                run_syevj(ddx, ddy, tix, tiy, rocblas_esort_ascending, rocblas_evect_original,
+                          rocblas_fill_upper, sbs, C + p2 + p2 * ldc, ldc, 0, eps, W_residual,
+                          MAXSWEEPS, W_n_sweeps, D + p2, info, W_Acpy + p2 + p2 * n,
+                          cosines_res, sines_diag, top, bottom);
+
+                break;
+            }
+
+            case CLASSICQR:
+            default:
+            {
+                // (Until STEQR is parallelized, only the first thread associated
+                // to each sub-block does computations)
+                if(tidb == 0)
+                    run_steqr(sbs, D + p2, E + p2, C + p2 + p2 * ldc, ldc, info, W + p2 * 2,
+                              30 * bs, eps, ssfmin, ssfmax, false);
+            }
+            } // end switch
+            __syncthreads();
+        }
+    }
+}
+
+//--------------------------------------------------------------------------------------//
+/** STEDC_MERGE_KERNEL implements the main loop of the DC algorithm to merge the
+    eigenvalues/eigenvectors of the different sub-blocks of each split-block.
+    A matrix in the batch could have many split-blocks, and each split-block could be
+    divided in a maximum of nn sub-blocks.
+    - Call this kernel with batch_count groups in y, and STEDC_NUM_SPLIT_BLKS groups in x.
+    Groups are size STEDC_BDIM.
+ - STEDC_NUM_SPLIT_BLKS is fixed (is the number of split-blocks that will be analysed + in parallel). If there are actually more split-blocks, some groups will work with more + than one split-block sequentially. **/ template -ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_int n, - S* DD, - const rocblas_stride strideD, - S* EE, - const rocblas_stride strideE, - S* CC, - const rocblas_int shiftC, - const rocblas_int ldc, - const rocblas_stride strideC, - rocblas_int* iinfo, - S* WA, - S* tmpzA, - S* vecsA, - rocblas_int* splitsA, - const S eps, - const S ssfmin, - const S ssfmax, - const rocblas_int maxblks, - int solver_mode) +ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_merge_kernel(const rocblas_int n, + S* DD, + const rocblas_stride strideD, + S* EE, + const rocblas_stride strideE, + S* CC, + const rocblas_int shiftC, + const rocblas_int ldc, + const rocblas_stride strideC, + S* tmpzA, + S* vecsA, + rocblas_int* splitsA, + const S eps, + const S ssfmin, + const S ssfmax, + const int solver_mode) { // threads and groups indices /* --------------------------------------------------- */ @@ -744,19 +1133,20 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i C = load_ptr_batch(CC, bid, shiftC, strideC); S* D = DD + bid * strideD; S* E = EE + bid * strideE; - rocblas_int* info = iinfo + bid; /* --------------------------------------------------- */ // temporary arrays in global memory /* --------------------------------------------------- */ // contains the beginning of split blocks - rocblas_int* splits = splitsA + bid * (3 * n + 2); + rocblas_int* splits = splitsA + bid * (5 * n + 2); + // the sub-blocks sizes + rocblas_int* nsA = splits + n + 2; + // the sub-blocks initial positions + rocblas_int* psA = nsA + n; // if idd[i] = 0, the value in position i has been deflated - rocblas_int* idd = splits + n + 2; + rocblas_int* idd = psA + n; // container of permutations when solving the 
secular eqns rocblas_int* pers = idd + n; - // worksapce for STEQR - S* W = WA + bid * (2 * n); // the rank-1 modification vectors in the merges S* z = tmpzA + bid * (2 * n); // roots of secular equations @@ -769,18 +1159,14 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i // temporary arrays in shared memory /* --------------------------------------------------- */ - extern __shared__ rocblas_int lmem[]; - // shares the sub-blocks sizes - rocblas_int* ns = lmem; - // shares the sub-blocks initial positions - rocblas_int* ps = ns + maxblks; + extern __shared__ rocblas_int lsmem[]; // used to store temp values during the different reductions - S* inrms = reinterpret_cast(ps + maxblks); + S* inrms = reinterpret_cast(lsmem); /* --------------------------------------------------- */ // local variables /* --------------------------------------------------- */ - // total number of blocks + // total number of split blocks rocblas_int nb = splits[n + 1]; // size of split block rocblas_int bs; @@ -792,7 +1178,11 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i rocblas_int blks; // number of level of division rocblas_int levs; + // number of threads working in sub-block + rocblas_int tn; + // other aux variables S p; + rocblas_int *ns, *ps; /* --------------------------------------------------- */ // work with STEDC_NUM_SPLIT_BLKS split blocks in parallel @@ -805,583 +1195,488 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i p1 = splits[kb]; p2 = splits[kb + 1]; bs = p2 - p1; + ns = nsA + p1; + ps = psA + p1; // determine ideal number of sub-blocks levs = stedc_num_levels(bs, solver_mode); blks = 1 << levs; - // if split block is too small - if(blks == 1) + // arrange threads so that a group of bdim/blks threads works with each sub-block + // tn is the number of threads associated to each sub-block + tn = STEDC_BDIM / blks; + // tid indexes the sub-block + tid = id / tn; 
+        p2 = ps[tid];
+        // tidb indexes the threads in each sub-block
+        tidb = id % tn;
+
+        // 3. MERGE PHASE
+        /* ----------------------------------------------------------------- */
+        rocblas_int iam, sz, bdm;
+        S* ptz;
+
+        // main loop to work with merges on each level
+        // (once two leaves in the merge tree are identified, all related threads
+        // work together to solve the secular equation and update eigenvectors)
+        for(int k = 0; k < levs; ++k)
         {
-            switch(solver_mode)
+            // 3a. find rank-1 modification components (z and p) for this merge
+            /* ----------------------------------------------------------------- */
+            // iam indexes the sub-blocks according to its level in the merge tree
+            rocblas_int bd = 1 << k;
+            rocblas_int ss = 1;
+            bdm = bd << 1;
+            iam = tid % bdm;
+
+            // Threads with iam < bd work with components above the merge point;
+            // threads with iam >= bd work below the merge point
+            if(iam < bd && tid < blks)
             {
-            case BISECTION:
-                // TODO
-                break;
-
-            case JACOBI:
-                // TODO
-                break;
-
-            case CLASSICQR:
-            default:
-                if(id == 0)
-                {
-                    run_steqr(bs, D + p1, E + p1, C + p1 + p1 * ldc, ldc, info, W + p1 * 2, 30 * bs,
-                              eps, ssfmin, ssfmax, false);
-                }
+                sz = ns[tid];
+                for(int j = 1; j < bd - iam; ++j)
+                    sz += ns[tid + j];
+                // with this, all threads involved in a merge (above merge point)
+                // will point to the same row of C and the same off-diag element
+                ptz = C + p2 - 1 + sz;
+                p = 2 * E[p2 - 1 + sz];
+            }
+            else if(iam >= bd && tid < blks)
+            {
+                sz = 0;
+                for(int j = 0; j < iam - bd; ++j)
+                    sz += ns[tid - j - 1];
+                // with this, all threads involved in a merge (below merge point)
+                // will point to the same row of C and the same off-diag element
+                ptz = C + p2 - sz;
+                p = 2 * E[p2 - sz - 1];
             }
-        }
-        // ===== otherwise, use divide & conquer =====
-        else
-        {
-            // arrange threads so that a group of bdim/blks threads works with each sub-block
-            // tn is the number of threads associated to each sub-block
-            rocblas_int tn = STEDC_BDIM / blks;
-            // tid indexes
the sub-block - tid = id / tn; - // tidb indexes the threads in each sub-block - tidb = id % tn; - - // 1. DIVIDE PHASE - /* ----------------------------------------------------------------- */ - // (artificially divide split block into blks sub-blocks - // find initial positions of each sub-blocks) + // copy elements of z if(tidb == 0) - ns[tid] = 0; - __syncthreads(); - - // find sub-block sizes - if(id == 0) { - ns[0] = bs; - rocblas_int t, t2; - for(int i = 0; i < levs; ++i) - { - for(int j = (1 << i); j > 0; --j) - { - t = ns[j - 1]; - t2 = t / 2; - ns[j * 2 - 1] = (2 * t2 < t) ? t2 + 1 : t2; - ns[j * 2 - 2] = t2; - } - } + for(int j = 0; j < ns[tid]; ++j) + z[p2 + j] = ptz[(p2 + j) * ldc] / sqrt(2); } __syncthreads(); - // find beginning of sub-block and update D elements - p2 = 0; - for(int i = 0; i < tid; ++i) - p2 += ns[i]; - p2 += p1; + /* ----------------------------------------------------------------- */ + + // 3b. calculate deflation tolerance + /* ----------------------------------------------------------------- */ + S valf, valg, maxd, maxz; + + // first compute maximum of diagonal and z in each thread block if(tidb == 0) { - ps[tid] = p2; - if(tid > 0 && tid < blks) + maxd = std::abs(D[p2]); + maxz = std::abs(z[p2]); + for(int i = 1; i < ns[tid]; ++i) { - // perform sub-block division - p = E[p2 - 1]; - D[p2] -= p; - D[p2 - 1] -= p; + valf = std::abs(D[p2 + i]); + valg = std::abs(z[p2 + i]); + maxd = valf > maxd ? valf : maxd; + maxz = valg > maxz ? valg : maxz; } + inrms[tid] = maxd; + inrms[tid + blks] = maxz; } __syncthreads(); - /* ----------------------------------------------------------------- */ - // 2. SOLVE PHASE - /* ----------------------------------------------------------------- */ - // Solve the blks sub-blocks in parallel. 
- switch(solver_mode) + // now follow reduction process + // (using only one thread as not compute intensive) + if(iam == 0 && tidb == 0) { - case BISECTION: - // TODO - break; - - case JACOBI: - // TODO - break; - - case CLASSICQR: - default: - // (Until STEQR is parallelized, only the first thread associated - // to each sub-block do computations) - if(tidb == 0) + for(int i = 1; i < bdm; ++i) { - run_steqr(ns[tid], D + p2, E + p2, C + p2 + p2 * ldc, ldc, info, W + p2 * 2, - 30 * bs, eps, ssfmin, ssfmax, false); + valf = inrms[tid + i]; + valg = inrms[tid + blks + i]; + maxd = valf > maxd ? valf : maxd; + maxz = valg > maxz ? valg : maxz; } + inrms[tid] = maxd; + inrms[tid + blks] = maxz; } __syncthreads(); + + // tol should be 8 * eps * (max diagonal or z element participating in merge) + maxd = inrms[tid - iam]; + maxz = inrms[tid - iam + blks]; + maxd = maxz > maxd ? maxz : maxd; + S tol = 8 * eps * maxd; /* ----------------------------------------------------------------- */ - // 3. MERGE PHASE + // 3c. deflate eigenvalues /* ----------------------------------------------------------------- */ - rocblas_int iam, sz, bdm; - S* ptz; + S f, g, c, s, r; - // main loop to work with merges on each level - // (once two leaves in the merge tree are identified, all related threads - // work together to solve the secular equation and update eiegenvectors) - for(int k = 0; k < levs; ++k) + // first deflate each thread sub-block + // (only the first thread of each sub-block works as this is + // a sequential process) + if(tidb == 0) { - // 3a. 
find rank-1 modification components (z and p) for this merge - /* ----------------------------------------------------------------- */ - // iam indexes the sub-blocks according to its level in the merge tree - rocblas_int bd = 1 << k; - rocblas_int ss = 1; - bdm = bd << 1; - iam = tid % bdm; - - // Threads with iam < bd work with components above the merge point; - // threads with iam >= bd work below the merge point - if(iam < bd && tid < blks) - { - sz = ns[tid]; - for(int j = 1; j < bd - iam; ++j) - sz += ns[tid + j]; - // with this, all threads involved in a merge (above merge point) - // will point to the same row of C and the same off-diag element - ptz = C + p2 - 1 + sz; - p = 2 * E[p2 - 1 + sz]; - } - else if(iam >= bd && tid < blks) + for(int i = 0; i < ns[tid]; ++i) { - sz = 0; - for(int j = 0; j < iam - bd; ++j) - sz += ns[tid - j - 1]; - // with this, all threads involved in a merge (below merge point) - // will point to the same row of C and the same off-diag element - ptz = C + p2 - sz; - p = 2 * E[p2 - sz - 1]; - } - - // copy elements of z - if(tidb == 0) - { - for(int j = 0; j < ns[tid]; ++j) - z[p2 + j] = ptz[(p2 + j) * ldc] / sqrt(2); - } - __syncthreads(); - /* ----------------------------------------------------------------- */ - - // 3b. calculate deflation tolerance - /* ----------------------------------------------------------------- */ - S valf, valg, maxd, maxz; - - // first compute maximum of diagonal and z in each thread block - if(tidb == 0) - { - maxd = std::abs(D[p2]); - maxz = std::abs(z[p2]); - for(int i = 1; i < ns[tid]; ++i) - { - valf = std::abs(D[p2 + i]); - valg = std::abs(z[p2 + i]); - maxd = valf > maxd ? valf : maxd; - maxz = valg > maxz ? 
valg : maxz; - } - inrms[tid] = maxd; - inrms[tid + blks] = maxz; - } - __syncthreads(); - - // now follow reduction process - // (using only one thread as not compute intensive) - if(iam == 0 && tidb == 0) - { - for(int i = 1; i < bdm; ++i) + g = z[p2 + i]; + if(abs(p * g) <= tol) { - valf = inrms[tid + i]; - valg = inrms[tid + blks + i]; - maxd = valf > maxd ? valf : maxd; - maxz = valg > maxz ? valg : maxz; + // deflated ev because component in z is zero + idd[p2 + i] = 0; } - inrms[tid] = maxd; - inrms[tid + blks] = maxz; - } - __syncthreads(); - - // tol should be 8 * eps * (max diagonal or z element participating in merge) - maxd = inrms[tid - iam]; - maxz = inrms[tid - iam + blks]; - maxd = maxz > maxd ? maxz : maxd; - S tol = 8 * eps * maxd; - /* ----------------------------------------------------------------- */ - - // 3c. deflate enigenvalues - /* ----------------------------------------------------------------- */ - S f, g, c, s, r; - - // first deflate each thread sub-block - // (only the first thread of each sub-block works as this is - // a sequential process) - if(tidb == 0) - { - for(int i = 0; i < ns[tid]; ++i) + else { - g = z[p2 + i]; - if(abs(p * g) <= tol) + rocblas_int jj = 1; + valg = D[p2 + i]; + for(int j = 0; j < i; ++j) { - // deflated ev because component in z is zero - idd[p2 + i] = 0; - } - else - { - rocblas_int jj = 1; - valg = D[p2 + i]; - for(int j = 0; j < i; ++j) + if(idd[p2 + j] == 1 && abs(D[p2 + j] - valg) <= tol) { - if(idd[p2 + j] == 1 && abs(D[p2 + j] - valg) <= tol) + // deflated ev because it is repeated + idd[p2 + i] = 0; + // rotation to eliminate component in z + f = z[p2 + j]; + lartg(f, g, c, s, r); + z[p2 + j] = r; + z[p2 + i] = 0; + // update C with the rotation + for(int ii = 0; ii < n; ++ii) { - // deflated ev because it is repeated - idd[p2 + i] = 0; - // rotation to eliminate component in z - f = z[p2 + j]; - lartg(f, g, c, s, r); - z[p2 + j] = r; - z[p2 + i] = 0; - // update C with the rotation - for(int ii = 
0; ii < n; ++ii) - { - valf = C[ii + (p2 + j) * ldc]; - valg = C[ii + (p2 + i) * ldc]; - C[ii + (p2 + j) * ldc] = valf * c - valg * s; - C[ii + (p2 + i) * ldc] = valf * s + valg * c; - } - break; + valf = C[ii + (p2 + j) * ldc]; + valg = C[ii + (p2 + i) * ldc]; + C[ii + (p2 + j) * ldc] = valf * c - valg * s; + C[ii + (p2 + i) * ldc] = valf * s + valg * c; } - jj++; - } - if(jj > i) - { - // non-deflated ev - idd[p2 + i] = 1; + break; } + jj++; + } + if(jj > i) + { + // non-deflated ev + idd[p2 + i] = 1; } } } - __syncthreads(); + } + __syncthreads(); - // then compare with other sub-blocks participating in this merge - // following a simple, reduction-like process. - // (only the first thread of each sub-block works in the reduction) - for(int ii = 0; ii <= k; ++ii) + // then compare with other sub-blocks participating in this merge + // following a simple, reduction-like process. + // (only the first thread of each sub-block works in the reduction) + for(int ii = 0; ii <= k; ++ii) + { + if(tidb == 0) { - if(tidb == 0) + rocblas_int div = 1 << (ii + 1); + //actual number of threads is halved each time + if(iam % div == div - 1) { - rocblas_int div = 1 << (ii + 1); - //actual number of threads is halved each time - if(iam % div == div - 1) + // find limits + rocblas_int inb = (1 << ii) - 1; + rocblas_int inc = div - 1; + rocblas_int countb = ns[tid]; + rocblas_int countc = 0; + for(int i = inc; i > inb; --i) + countc += ns[tid - i]; + for(int i = inb; i > 0; --i) + countb += ns[tid - i]; + inb = ps[tid - inb]; + inc = ps[tid - inc]; + + // perform comparisons + for(int i = 0; i < countb; ++i) { - // find limits - rocblas_int inb = (1 << ii) - 1; - rocblas_int inc = div - 1; - rocblas_int countb = ns[tid]; - rocblas_int countc = 0; - for(int i = inc; i > inb; --i) - countc += ns[tid - i]; - for(int i = inb; i > 0; --i) - countb += ns[tid - i]; - inb = ps[tid - inb]; - inc = ps[tid - inc]; - - // perform comparisons - for(int i = 0; i < countb; ++i) + if(idd[inb + i] 
== 1) { - if(idd[inb + i] == 1) + valg = D[inb + i]; + for(int j = 0; j < countc; ++j) { - valg = D[inb + i]; - for(int j = 0; j < countc; ++j) + if(idd[inc + j] == 1 && abs(D[inc + j] - valg) <= tol) { - if(idd[inc + j] == 1 && abs(D[inc + j] - valg) <= tol) + // deflated ev because it is repeated + idd[inb + i] = 0; + // rotation to eliminate component in z + g = z[inb + i]; + f = z[inc + j]; + lartg(f, g, c, s, r); + z[inc + j] = r; + z[inb + i] = 0; + // update C with the rotation + for(int ii = 0; ii < n; ++ii) { - // deflated ev because it is repeated - idd[inb + i] = 0; - // rotation to eliminate component in z - g = z[inb + i]; - f = z[inc + j]; - lartg(f, g, c, s, r); - z[inc + j] = r; - z[inb + i] = 0; - // update C with the rotation - for(int ii = 0; ii < n; ++ii) - { - valf = C[ii + (inc + j) * ldc]; - valg = C[ii + (inb + i) * ldc]; - C[ii + (inc + j) * ldc] = valf * c - valg * s; - C[ii + (inb + i) * ldc] = valf * s + valg * c; - } - break; + valf = C[ii + (inc + j) * ldc]; + valg = C[ii + (inb + i) * ldc]; + C[ii + (inc + j) * ldc] = valf * c - valg * s; + C[ii + (inb + i) * ldc] = valf * s + valg * c; } + break; } } } } } - __syncthreads(); } - /* ----------------------------------------------------------------- */ - - // 3d. Organize data with non-deflated values to prepare secular equation - /* ----------------------------------------------------------------- */ - // determine boundaries of what would be the new merged sub-block - // 'in' will be its initial position - rocblas_int in = ps[tid - iam]; - // 'sz' will be its size (i.e. the sum of the sizes of all merging sub-blocks) - sz = ns[tid]; - for(int i = iam; i > 0; --i) - sz += ns[tid - i]; - for(int i = bdm - 1 - iam; i > 0; --i) - sz += ns[tid + i]; - - // All threads of the participating merging blocks will work together - // to solve the correspondinbg secular eqn. 
Now 'iam' indexes those threads - iam = iam * tn + tidb; - bdm *= tn; - - // define shifted arrays - S* tmpd = temps + in * n; - S* ev = evs + in; - S* diag = D + in; - rocblas_int* mask = idd + in; - S* zz = z + in; - rocblas_int* per = pers + in; - - // find degree and components of secular equation - // tmpd contains the non-deflated diagonal elements (ie. poles of the secular eqn) - // zz contains the corresponding non-zero elements of the rank-1 modif vector - rocblas_int dd = 0; - for(int i = 0; i < sz; ++i) + __syncthreads(); + } + /* ----------------------------------------------------------------- */ + + // 3d. Organize data with non-deflated values to prepare secular equation + /* ----------------------------------------------------------------- */ + // determine boundaries of what would be the new merged sub-block + // 'in' will be its initial position + rocblas_int in = ps[tid - iam]; + // 'sz' will be its size (i.e. the sum of the sizes of all merging sub-blocks) + sz = ns[tid]; + for(int i = iam; i > 0; --i) + sz += ns[tid - i]; + for(int i = bdm - 1 - iam; i > 0; --i) + sz += ns[tid + i]; + + // All threads of the participating merging blocks will work together + // to solve the corresponding secular eqn. Now 'iam' indexes those threads + iam = iam * tn + tidb; + bdm *= tn; + + // define shifted arrays + S* tmpd = temps + in * n; + S* ev = evs + in; + S* diag = D + in; + rocblas_int* mask = idd + in; + S* zz = z + in; + rocblas_int* per = pers + in; + + // find degree and components of secular equation + // tmpd contains the non-deflated diagonal elements (ie. poles of the secular eqn) + // zz contains the corresponding non-zero elements of the rank-1 modif vector + rocblas_int dd = 0; + for(int i = 0; i < sz; ++i) + { + if(mask[i] == 1) { - if(mask[i] == 1) + if(tidb == 0 && iam == 0) { - if(tidb == 0 && iam == 0) - { - per[dd] = i; - tmpd[dd] = p < 0 ? 
-diag[i] : diag[i]; - if(dd != i) - zz[dd] = zz[i]; - } - dd++; + per[dd] = i; + tmpd[dd] = p < 0 ? -diag[i] : diag[i]; + if(dd != i) + zz[dd] = zz[i]; } + dd++; } - __syncthreads(); + } + __syncthreads(); - // Order the elements in tmpd and zz using a simple parallel selection/bubble sort. - // This will allow us to find initial intervals for eigenvalue guesses - rocblas_int tsz = 1 << (levs - 1 - k); - tsz = (bs - 1) / tsz + 1; - for(int i = 0; i < tsz; ++i) + // Order the elements in tmpd and zz using a simple parallel selection/bubble sort. + // This will allow us to find initial intervals for eigenvalue guesses + rocblas_int tsz = 1 << (levs - 1 - k); + tsz = (bs - 1) / tsz + 1; + for(int i = 0; i < tsz; ++i) + { + if(i < dd) { - if(i < dd) + if(i % 2 == 0) { - if(i % 2 == 0) + for(int j = iam; j < dd / 2; j += bdm) { - for(int j = iam; j < dd / 2; j += bdm) + if(tmpd[2 * j] > tmpd[2 * j + 1]) { - if(tmpd[2 * j] > tmpd[2 * j + 1]) - { - valf = tmpd[2 * j]; - tmpd[2 * j] = tmpd[2 * j + 1]; - tmpd[2 * j + 1] = valf; - valf = zz[2 * j]; - zz[2 * j] = zz[2 * j + 1]; - zz[2 * j + 1] = valf; - bd = per[2 * j]; - per[2 * j] = per[2 * j + 1]; - per[2 * j + 1] = bd; - } + valf = tmpd[2 * j]; + tmpd[2 * j] = tmpd[2 * j + 1]; + tmpd[2 * j + 1] = valf; + valf = zz[2 * j]; + zz[2 * j] = zz[2 * j + 1]; + zz[2 * j + 1] = valf; + bd = per[2 * j]; + per[2 * j] = per[2 * j + 1]; + per[2 * j + 1] = bd; } } - else + } + else + { + for(int j = iam; j < (dd - 1) / 2; j += bdm) { - for(int j = iam; j < (dd - 1) / 2; j += bdm) + if(tmpd[2 * j + 1] > tmpd[2 * j + 2]) { - if(tmpd[2 * j + 1] > tmpd[2 * j + 2]) - { - valf = tmpd[2 * j + 1]; - tmpd[2 * j + 1] = tmpd[2 * j + 2]; - tmpd[2 * j + 2] = valf; - valf = zz[2 * j + 1]; - zz[2 * j + 1] = zz[2 * j + 2]; - zz[2 * j + 2] = valf; - bd = per[2 * j + 1]; - per[2 * j + 1] = per[2 * j + 2]; - per[2 * j + 2] = bd; - } + valf = tmpd[2 * j + 1]; + tmpd[2 * j + 1] = tmpd[2 * j + 2]; + tmpd[2 * j + 2] = valf; + valf = zz[2 * j + 1]; + zz[2 * j 
+ 1] = zz[2 * j + 2]; + zz[2 * j + 2] = valf; + bd = per[2 * j + 1]; + per[2 * j + 1] = per[2 * j + 2]; + per[2 * j + 2] = bd; } } } - __syncthreads(); } + __syncthreads(); + } + + // make dd copies of the non-deflated ordered diagonal elements + // (i.e. the poles of the secular eqn) so that the distances to the + // eigenvalues (D - lambda_i) are updated while computing each eigenvalue. + // This will prevent collapses and division by zero when an eigenvalue + // is too close to a pole. + for(int j = iam + 1; j < sz; j += bdm) + { + for(int i = 0; i < dd; ++i) + tmpd[i + j * n] = tmpd[i]; + } + + // finally copy over all diagonal elements in ev. ev will be overwritten by the + // new computed eigenvalues of the merged block + for(int i = iam; i < sz; i += bdm) + ev[i] = diag[i]; + __syncthreads(); + /* ----------------------------------------------------------------- */ - // make dd copies of the non-deflated ordered diagonal elements - // (i.e. the poles of the secular eqn) so that the distances to the - // eigenvalues (D - lambda_i) are updated while computing each eigenvalue. - // This will prevent collapses and division by zero when an eigenvalue - // is too close to a pole. - for(int j = iam + 1; j < sz; j += bdm) + // 3e. Solve secular eqns, i.e. find the dd zeros + // corresponding to non-deflated new eigenvalues of the merged block + /* ----------------------------------------------------------------- */ + // each thread will find a different zero in parallel + S a, b; + for(int j = iam; j < sz; j += bdm) + { + if(mask[j] == 1) { - for(int i = 0; i < dd; ++i) - tmpd[i + j * n] = tmpd[i]; + // find position in the ordered array + int cc = 0; + valf = p < 0 ? -ev[j] : ev[j]; + for(int jj = 0; jj < dd; ++jj) + { + if(tmpd[jj + j * n] == valf) + break; + else + cc++; + } + + // computed zero will overwrite 'ev' at the corresponding position. + // 'tmpd' will be updated with the distances D - lambda_i. + // deflated values are not changed. 
+ rocblas_int linfo; + if(cc == dd - 1) + linfo = seq_solve_ext(dd, tmpd + j * n, zz, (p < 0 ? -p : p), ev + j, eps, + ssfmin, ssfmax); + else + linfo = seq_solve(dd, tmpd + j * n, zz, (p < 0 ? -p : p), cc, ev + j, eps, + ssfmin, ssfmax); + if(p < 0) + ev[j] *= -1; } + } + __syncthreads(); - // finally copy over all diagonal elements in ev. ev will be overwritten by the - // new computed eigenvalues of the merged block - for(int i = iam; i < sz; i += bdm) - ev[i] = diag[i]; - __syncthreads(); - /* ----------------------------------------------------------------- */ - - // 3e. Solve secular eqns, i.e. find the dd zeros - // corresponding to non-deflated new eigenvalues of the merged block - /* ----------------------------------------------------------------- */ - // each thread will find a different zero in parallel - S a, b; - for(int j = iam; j < sz; j += bdm) + // Re-scale vector Z to avoid bad numerics when an eigenvalue + // is too close to a pole + for(int i = iam; i < dd; i += bdm) + { + valf = 1; + for(int j = 0; j < sz; ++j) { if(mask[j] == 1) { - // find position in the ordered array - int cc = 0; - valf = p < 0 ? -ev[j] : ev[j]; - for(int jj = 0; jj < dd; ++jj) - { - if(tmpd[jj + j * n] == valf) - break; - else - cc++; - } - - // computed zero will overwrite 'ev' at the corresponding position. - // 'tmpd' will be updated with the distances D - lambda_i. - // deflated values are not changed. - rocblas_int linfo; - if(cc == dd - 1) - linfo = seq_solve_ext(dd, tmpd + j * n, zz, (p < 0 ? -p : p), ev + j, - eps, ssfmin, ssfmax); + valg = tmpd[i + j * n]; + if(p > 0) + valf *= (per[i] == j) ? valg : valg / (diag[per[i]] - diag[j]); else - linfo = seq_solve(dd, tmpd + j * n, zz, (p < 0 ? -p : p), cc, ev + j, - eps, ssfmin, ssfmax); - if(p < 0) - ev[j] *= -1; + valf *= (per[i] == j) ? valg : -valg / (diag[per[i]] - diag[j]); } } - __syncthreads(); + valf = sqrt(-valf); + zz[i] = zz[i] < 0 ? 
-valf : valf; + } + __syncthreads(); + /* ----------------------------------------------------------------- */ - // Re-scale vector Z to avoid bad numerics when an eigenvalue - // is too close to a pole - for(int i = iam; i < dd; i += bdm) + // 3f. Compute vectors corresponding to non-deflated values + /* ----------------------------------------------------------------- */ + S temp, nrm, evj; + rocblas_int nn = (bs - 1) / blks + 1; + bool go; + for(int j = 0; j < nn; ++j) + { + go = (j < ns[tid] && idd[p2 + j] == 1); + + // compute vectors of rank-1 perturbed system and their norms + nrm = 0; + if(go) { - valf = 1; - for(int j = 0; j < sz; ++j) + evj = evs[p2 + j]; + for(int i = tidb; i < dd; i += tn) { - if(mask[j] == 1) - { - valg = tmpd[i + j * n]; - if(p > 0) - valf *= (per[i] == j) ? valg : valg / (diag[per[i]] - diag[j]); - else - valf *= (per[i] == j) ? valg : -valg / (diag[per[i]] - diag[j]); - } + valf = zz[i] / temps[i + (p2 + j) * n]; + nrm += valf * valf; + temps[i + (p2 + j) * n] = valf; } - valf = sqrt(-valf); - zz[i] = zz[i] < 0 ? -valf : valf; } + inrms[tid * tn + tidb] = nrm; __syncthreads(); - /* ----------------------------------------------------------------- */ - - // 3f. 
Compute vectors corresponding to non-deflated values - /* ----------------------------------------------------------------- */ - S temp, nrm, evj; - rocblas_int nn = (bs - 1) / blks + 1; - bool go; - for(int j = 0; j < nn; ++j) + + // reduction (for the norms) + for(int r = tn / 2; r > 0; r /= 2) { - go = (j < ns[tid] && idd[p2 + j] == 1); + if(go && tidb < r) + { + nrm += inrms[tid * tn + tidb + r]; + inrms[tid * tn + tidb] = nrm; + } + __syncthreads(); + } + nrm = sqrt(nrm); - // compute vectors of rank-1 perturbed system and their norms - nrm = 0; + // multiply by C (row by row) + for(int ii = 0; ii < tsz; ++ii) + { + rocblas_int i = in + ii; + go &= (ii < sz); + + // inner products + temp = 0; if(go) { - evj = evs[p2 + j]; - for(int i = tidb; i < dd; i += tn) - { - valf = zz[i] / temps[i + (p2 + j) * n]; - nrm += valf * valf; - temps[i + (p2 + j) * n] = valf; - } + for(int kk = tidb; kk < dd; kk += tn) + temp += C[i + (per[kk] + in) * ldc] * temps[kk + (p2 + j) * n]; } - inrms[tid * tn + tidb] = nrm; + inrms[tid * tn + tidb] = temp; __syncthreads(); - // reduction (for the norms) + // reduction for(int r = tn / 2; r > 0; r /= 2) { if(go && tidb < r) { - nrm += inrms[tid * tn + tidb + r]; - inrms[tid * tn + tidb] = nrm; + temp += inrms[tid * tn + tidb + r]; + inrms[tid * tn + tidb] = temp; } __syncthreads(); } - nrm = sqrt(nrm); - - // multiply by C (row by row) - for(int ii = 0; ii < tsz; ++ii) - { - rocblas_int i = in + ii; - go &= (ii < sz); - // inner products - temp = 0; - if(go) - { - for(int kk = tidb; kk < dd; kk += tn) - temp += C[i + (per[kk] + in) * ldc] * temps[kk + (p2 + j) * n]; - } - inrms[tid * tn + tidb] = temp; - __syncthreads(); - - // reduction - for(int r = tn / 2; r > 0; r /= 2) - { - if(go && tidb < r) - { - temp += inrms[tid * tn + tidb + r]; - inrms[tid * tn + tidb] = temp; - } - __syncthreads(); - } - - // result - if(go && tidb == 0) - vecs[i + (p2 + j) * n] = temp / nrm; - __syncthreads(); - } + // result + if(go && tidb == 0) + 
vecs[i + (p2 + j) * n] = temp / nrm; + __syncthreads(); } - __syncthreads(); - /* ----------------------------------------------------------------- */ + } + __syncthreads(); + /* ----------------------------------------------------------------- */ - // 3g. update D and C with computed values and vectors - /* ----------------------------------------------------------------- */ - for(int j = 0; j < nn; ++j) + // 3g. update D and C with computed values and vectors + /* ----------------------------------------------------------------- */ + for(int j = 0; j < nn; ++j) + { + if(j < ns[tid] && idd[p2 + j] == 1) { - if(j < ns[tid] && idd[p2 + j] == 1) - { - if(tidb == 0) - D[p2 + j] = evs[p2 + j]; - for(int i = in + tidb; i < in + sz; i += tn) - C[i + (p2 + j) * ldc] = vecs[i + (p2 + j) * n]; - } - __syncthreads(); + if(tidb == 0) + D[p2 + j] = evs[p2 + j]; + for(int i = in + tidb; i < in + sz; i += tn) + C[i + (p2 + j) * ldc] = vecs[i + (p2 + j) * n]; } - /* ----------------------------------------------------------------- */ - - } // end of main loop in merge phase of divide & conquer + __syncthreads(); + } + /* ----------------------------------------------------------------- */ - } // end of conditional that decides when to use normal algorithm or divide & conquer + } // end of main loop in merge phase of divide & conquer } // end of for-loop for the independent split blocks } @@ -1431,6 +1726,10 @@ ROCSOLVER_KERNEL void stedc_sort(const rocblas_int n, } } +/******************* Host functions ********************************************/ +/*******************************************************************************/ + +//--------------------------------------------------------------------------------------// /** This local gemm adapts rocblas_gemm to multiply complex*real, and overwrite result: A = A*B **/ template void rocsolver_stedc_getMemorySize(const rocblas_evect evect, @@ -1598,11 +1898,16 @@ void rocsolver_stedc_getMemorySize(const rocblas_evect evect, { 
size_t s1, s2; - // requirements for classic solver of small independent blocks - if(solver_mode == BISECTION || solver_mode == JACOBI) - s1 = 0; - else - rocsolver_steqr_getMemorySize(evect, n, batch_count, &s1); + // requirements for solver of small independent blocks + switch(solver_mode) + { + case BISECTION: s1 = 0; break; + + case JACOBI: s1 = sizeof(S) * (n * n + 2) * batch_count; break; + + case CLASSICQR: + default: rocsolver_steqr_getMemorySize(evect, n, batch_count, &s1); + } // extra requirements for original eigenvectors of small independent blocks *size_tempvect = (n * n) * batch_count * sizeof(S); @@ -1618,13 +1923,14 @@ void rocsolver_stedc_getMemorySize(const rocblas_evect evect, *size_work_stack = max(s1, s2); // size for split blocks and sub-blocks positions - *size_splits = sizeof(rocblas_int) * (3 * n + 2) * batch_count; + *size_splits = sizeof(rocblas_int) * (5 * n + 2) * batch_count; // size for temporary diagonal and rank-1 modif vector *size_tmpz = sizeof(S) * (2 * n) * batch_count; } } +//--------------------------------------------------------------------------------------// /** This helper check argument correctness for stedc API **/ template rocblas_status rocsolver_stedc_argCheck(rocblas_handle handle, @@ -1660,6 +1966,7 @@ rocblas_status rocsolver_stedc_argCheck(rocblas_handle handle, return rocblas_status_continue; } +//--------------------------------------------------------------------------------------// /** STEDC templated function **/ template rocblas_status rocsolver_stedc_template(rocblas_handle handle, @@ -1734,10 +2041,6 @@ rocblas_status rocsolver_stedc_template(rocblas_handle handle, ssfmax = sqrt(ssfmax) / S(3.0); rocblas_int blocksn = (n - 1) / BS2 + 1; - // find independent split blocks in matrix - ROCSOLVER_LAUNCH_KERNEL(stedc_split, dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, - strideD, E + shiftE, strideE, splits, eps); - // initialize identity matrix in C if required if(evect == rocblas_evect_tridiagonal) 
ROCSOLVER_LAUNCH_KERNEL(init_ident, dim3(blocksn, blocksn, batch_count), @@ -1751,19 +2054,44 @@ rocblas_status rocsolver_stedc_template(rocblas_handle handle, // find max number of sub-blocks to consider during the divide phase rocblas_int maxblks = 1 << stedc_num_levels(n, solver_mode); - size_t lmemsize = sizeof(rocblas_int) * 2 * maxblks + sizeof(S) * STEDC_BDIM; - // execute divide and conquer kernel with tempvect - ROCSOLVER_LAUNCH_KERNEL((stedc_kernel), dim3(STEDC_NUM_SPLIT_BLKS, batch_count), + // find independent split blocks in matrix + ROCSOLVER_LAUNCH_KERNEL(stedc_split, dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, + strideD, E + shiftE, strideE, splits, eps); + + // 1. divide phase + //----------------------------- + ROCSOLVER_LAUNCH_KERNEL((stedc_divide_kernel), dim3(batch_count), dim3(STEDC_BDIM), 0, + stream, n, D + shiftD, strideD, E + shiftE, strideE, splits, + solver_mode); + + // 2. solve phase + //----------------------------- + size_t lmemsize = 0; + if(solver_mode == JACOBI) + lmemsize += (n + n % 2) * (sizeof(rocblas_int) + sizeof(S)); + + ROCSOLVER_LAUNCH_KERNEL((stedc_solve_kernel), + dim3(maxblks, STEDC_NUM_SPLIT_BLKS, batch_count), dim3(STEDC_BDIM), + lmemsize, stream, n, D + shiftD, strideD, E + shiftE, strideE, + tempvect, 0, ldt, strideT, info, (S*)work_stack, splits, eps, + ssfmin, ssfmax, solver_mode); + + // 3. merge phase + //---------------- + lmemsize = sizeof(S) * STEDC_BDIM; + + ROCSOLVER_LAUNCH_KERNEL((stedc_merge_kernel), dim3(STEDC_NUM_SPLIT_BLKS, batch_count), dim3(STEDC_BDIM), lmemsize, stream, n, D + shiftD, strideD, - E + shiftE, strideE, tempvect, 0, ldt, strideT, info, (S*)work_stack, - tmpz, tempgemm, splits, eps, ssfmin, ssfmax, maxblks, solver_mode); + E + shiftE, strideE, tempvect, 0, ldt, strideT, tmpz, tempgemm, + splits, eps, ssfmin, ssfmax, solver_mode); - // update eigenvectors C <- C*tempvect + // 4. 
update and sort + //---------------------- + // eigenvectors C <- C*tempvect local_gemm(handle, n, C, shiftC, ldc, strideC, tempvect, tempgemm, (S*)work_stack, 0, ldt, strideT, batch_count, workArr); - // finally sort eigenvalues and eigenvectors ROCSOLVER_LAUNCH_KERNEL((stedc_sort), dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, strideD, C, shiftC, ldc, strideC); } diff --git a/library/src/include/ideal_sizes.hpp b/library/src/include/ideal_sizes.hpp index ad965bb46..903aed00f 100644 --- a/library/src/include/ideal_sizes.hpp +++ b/library/src/include/ideal_sizes.hpp @@ -228,6 +228,14 @@ if any, will be factorized with the unblocked algorithm (SYTF2).*/ #define SYTRF_SYTF2_SWITCHSIZE 128 +/****************************** syevdj ****************************************** +*******************************************************************************/ +/*! \brief Determines the minimum size required for the Jacobi divide and conquer to be used. + + \details If the size of the block is smaller than SYEVDJ_MIN_DC_SIZE, + the eigenvectors are computed with the normal Jacobi algorithm. */ +#define SYEVDJ_MIN_DC_SIZE 16 + /**************************** getf2/getfr ************************************* *******************************************************************************/ #define GETF2_SPKER_MAX_M 1024 //always <= 1024 diff --git a/library/src/lapack/roclapack_syevdj_heevdj.cpp b/library/src/lapack/roclapack_syevdj_heevdj.cpp new file mode 100644 index 000000000..68073a543 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj.cpp @@ -0,0 +1,173 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_syevdj_heevdj.hpp" + +template +rocblas_status rocsolver_syevdj_heevdj_impl(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int lda, + S* D, + rocblas_int* info) +{ + const char* name = (!rocblas_is_complex ? 
"syevdj" : "heevdj"); + ROCSOLVER_ENTER_TOP(name, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", lda); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_syevdj_heevdj_argCheck(handle, evect, uplo, n, A, lda, D, info); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // normal (non-batched non-strided) execution + rocblas_stride strideA = 0; + rocblas_stride strideD = 0; + rocblas_int batch_count = 1; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + size_t size_work1; + size_t size_work2; + size_t size_work3; + // extra space for call stedc + size_t size_workSplits, size_work4; + // size of array of pointers (only for batched case) + size_t size_workArr; + // size for temporary householder scalars + size_t size_workTau; + // size for temporary vectors + size_t size_workVec; + // size for temporary superdiagonal of tridiag form + size_t size_workE; + + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &size_scalars, &size_workE, &size_workTau, &size_workVec, + &size_workSplits, &size_work1, &size_work2, &size_work3, &size_work4, &size_workArr); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_workE, size_workTau, size_workVec, size_workSplits, + size_work1, size_work2, size_work3, size_work4, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workVec, *workSplits, *workTau, *workArr; + rocblas_device_malloc mem(handle, size_scalars, size_workE, size_workTau, size_workVec, + size_workSplits, size_work1, size_work2, size_work3, size_work4, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + workE = mem[1]; + workTau = mem[2]; + workVec = mem[3]; + workSplits = 
mem[4]; + work1 = mem[5]; + work2 = mem[6]; + work3 = mem[7]; + work4 = mem[8]; + workArr = mem[9]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, info, batch_count, (T*)scalars, + (S*)workE, (T*)workTau, (T*)workVec, (rocblas_int*)workSplits, work1, work2, work3, work4, + workArr); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, info); +} + +rocblas_status rocsolver_dsyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, info); +} + +rocblas_status rocsolver_cheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + float* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, + info); +} + +rocblas_status rocsolver_zheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + double* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, + info); +} + +} // extern C diff --git a/library/src/lapack/roclapack_syevdj_heevdj.hpp b/library/src/lapack/roclapack_syevdj_heevdj.hpp new file mode 100644 index 
000000000..0ba73bb06 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj.hpp @@ -0,0 +1,265 @@ +/************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ************************************************************************/ + +#pragma once + +#include "auxiliary/rocauxiliary_ormtr_unmtr.hpp" +#include "auxiliary/rocauxiliary_stedc.hpp" +#include "rocblas.hpp" +#include "roclapack_syev_heev.hpp" +#include "roclapack_sytrd_hetrd.hpp" +#include "rocsolver/rocsolver.h" + +/** Helper to calculate workspace sizes **/ +template +void rocsolver_syevdj_heevdj_getMemorySize(const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + const rocblas_int batch_count, + size_t* size_scalars, + size_t* size_workE, + size_t* size_workTau, + size_t* size_workVec, + size_t* size_workSplits, + size_t* size_work1, + size_t* size_work2, + size_t* size_work3, + size_t* size_work4, + size_t* size_workArr) +{ + // if quick return, set workspace to zero + if(n <= 1 || batch_count == 0) + { + *size_scalars = 0; + *size_work1 = 0; + *size_work2 = 0; + *size_work3 = 0; + *size_work4 = 0; + *size_workSplits = 0; + *size_workE = 0; + *size_workTau = 0; + *size_workVec = 0; + *size_workArr = 0; + return; + } + + // if size too small or no vectors required + if(evect != rocblas_evect_original || n < SYEVDJ_MIN_DC_SIZE) + { + // space to store the residual + *size_workE = sizeof(S) * batch_count; + + // space to store the number of sweeps + *size_workSplits = sizeof(rocblas_int) * batch_count; + + // requirements for jacobi + rocsolver_syevj_heevj_getMemorySize(evect, uplo, n, batch_count, + size_workVec, size_workTau, size_work1, + size_work2, size_work3, size_work4); + + *size_scalars = 0; + *size_workArr = 0; + + return; + } + + size_t unused; + size_t w11 = 0, w12 = 0, w13 = 0; + size_t w21 = 0, w22 = 0, w23 = 0; + size_t w31 = 0, w32 = 0, w33 = 0; + + // space for the superdiagonal of tridiag form + *size_workE = sizeof(S) * n * batch_count; + + // space for the householder scalars + *size_workTau = sizeof(T) * n * batch_count; + + // temp space for eigenvectors + *size_workVec = sizeof(T) * n * n * batch_count; + + 
// requirements for tridiagonalization (sytrd/hetrd) + rocsolver_sytrd_hetrd_getMemorySize(n, batch_count, size_scalars, &w11, &w21, &w31, + &unused); + + // extra requirements for computing eigenvalues and vectors (stedc) + int solver_mode = JACOBI; + rocsolver_stedc_getMemorySize(rocblas_evect_tridiagonal, n, batch_count, &w12, + &w22, &w32, size_work4, size_workSplits, &unused, + solver_mode); + + // extra requirements for ormtr/unmtr + rocsolver_ormtr_unmtr_getMemorySize(rocblas_side_left, uplo, n, n, batch_count, + &unused, &w13, &w23, &w33, &unused); + + // get max values + *size_work1 = std::max({w11, w12, w13}); + *size_work2 = std::max({w21, w22, w23}); + *size_work3 = std::max({w31, w32, w33}); + + // size of array of pointers to workspace + if(BATCHED) + *size_workArr = 2 * sizeof(T*) * batch_count; + else + *size_workArr = 0; +} + +/** Argument checking **/ +template +rocblas_status rocsolver_syevdj_heevdj_argCheck(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + T A, + const rocblas_int lda, + S* D, + rocblas_int* info, + const rocblas_int batch_count = 1) +{ + // order is important for unit tests: + + // 1. invalid/non-supported values + if((evect != rocblas_evect_original && evect != rocblas_evect_none) + || (uplo != rocblas_fill_lower && uplo != rocblas_fill_upper)) + return rocblas_status_invalid_value; + + // 2. invalid size + if(n < 0 || lda < n || batch_count < 0) + return rocblas_status_invalid_size; + + // skip pointer check if querying memory size + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_status_continue; + + // 3. 
invalid pointers + if((n && !A) || (n && !D) || (batch_count && !info)) + return rocblas_status_invalid_pointer; + + return rocblas_status_continue; +} + +template +rocblas_status rocsolver_syevdj_heevdj_template(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count, + T* scalars, + S* workE, + T* workTau, + T* workVec, + rocblas_int* workSplits, + void* work1, + void* work2, + void* work3, + void* work4, + void* workArr) +{ + ROCSOLVER_ENTER("syevdj_heevdj", "evect:", evect, "uplo:", uplo, "n:", n, "shiftA:", shiftA, + "lda:", lda, "bc:", batch_count); + + // quick return + if(batch_count == 0) + return rocblas_status_success; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + rocblas_int blocksReset = (batch_count - 1) / BS1 + 1; + dim3 gridReset(blocksReset, 1, 1); + dim3 threads(BS1, 1, 1); + + // info = 0 + ROCSOLVER_LAUNCH_KERNEL(reset_info, gridReset, threads, 0, stream, info, batch_count, 0); + + // quick return + if(n == 0) + return rocblas_status_success; + + // quick return for n = 1 (scalar case) + if(n == 1) + { + ROCSOLVER_LAUNCH_KERNEL(scalar_case, gridReset, threads, 0, stream, evect, A, strideA, D, + strideD, batch_count); + return rocblas_status_success; + } + + // TODO: Scale the matrix + + if(evect != rocblas_evect_original || n < SYEVDJ_MIN_DC_SIZE) + { + // **** do not use D&C approach **** + + rocsolver_syevj_heevj_template( + handle, rocblas_esort_ascending, evect, uplo, n, A, shiftA, lda, strideA, (S)0, workE, + 20, workSplits, D, strideD, info, batch_count, workVec, workTau, (S*)work1, + (rocblas_int*)work2, (rocblas_int*)work3, (rocblas_int*)work4); + } + else + { + // **** Use D&C approach **** + + // reduce A to tridiagonal form + // (Note: a tridiag form is necessary to apply D&C. 
To solve the subblocks with Jacobi will + // require copy D and E into a full tridiag matrix however, given all the zeros above the super diagonal, + // it is expected that the algorithm converges in fewer sweeps) + rocsolver_sytrd_hetrd_template(handle, uplo, n, A, shiftA, lda, strideA, D, + strideD, workE, n, workTau, n, batch_count, scalars, + (T*)work1, (T*)work2, (T*)work3, (T**)workArr); + + constexpr bool ISBATCHED = BATCHED || STRIDED; + const rocblas_int ldv = n; + const rocblas_stride strideV = n * n; + + // solve with Jacobi solver + int solver_mode = JACOBI; + rocsolver_stedc_template( + handle, rocblas_evect_tridiagonal, n, D, 0, strideD, workE, 0, n, workVec, 0, ldv, + strideV, info, batch_count, work1, (S*)work2, (S*)work3, (S*)work4, workSplits, + (S**)workArr, solver_mode); + + // update vectors + rocsolver_ormtr_unmtr_template( + handle, rocblas_side_left, uplo, rocblas_operation_none, n, n, A, shiftA, lda, strideA, + workTau, n, workVec, 0, ldv, strideV, batch_count, scalars, (T*)work1, (T*)work2, + (T*)work3, (T**)workArr); + + // copy vectors into A + const rocblas_int copyblocks = (n - 1) / BS2 + 1; + ROCSOLVER_LAUNCH_KERNEL(copy_mat, dim3(copyblocks, copyblocks, batch_count), + dim3(BS2, BS2), 0, stream, n, n, workVec, 0, ldv, strideV, A, + shiftA, lda, strideA); + } + + return rocblas_status_success; +} diff --git a/library/src/lapack/roclapack_syevdj_heevdj_batched.cpp b/library/src/lapack/roclapack_syevdj_heevdj_batched.cpp new file mode 100644 index 000000000..266f40d43 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj_batched.cpp @@ -0,0 +1,185 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_syevdj_heevdj.hpp" + +template +rocblas_status rocsolver_syevdj_heevdj_batched_impl(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int lda, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"syevdj_batched" : "heevdj_batched"); + ROCSOLVER_ENTER_TOP(name, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", lda, "--strideD", + strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st + = rocsolver_syevdj_heevdj_argCheck(handle, evect, uplo, n, A, lda, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // batched execution + rocblas_stride strideA = 0; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + size_t size_work1; + size_t size_work2; + size_t size_work3; + // extra space for call stedc + size_t size_workSplits, size_work4; + // size of array of pointers (only for batched case) + size_t size_workArr; + // size for temporary householder scalars + size_t size_workTau; + // size for temporary vectors + size_t size_workVec; + // size for temporary superdiagonal of tridiag form + size_t size_workE; + + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &size_scalars, &size_workE, &size_workTau, &size_workVec, + &size_workSplits, &size_work1, &size_work2, &size_work3, &size_work4, &size_workArr); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_workE, size_workTau, size_workVec, size_workSplits, + size_work1, size_work2, size_work3, size_work4, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workVec, *workSplits, *workTau, *workArr; + rocblas_device_malloc mem(handle, size_scalars, size_workE, size_workTau, size_workVec, + size_workSplits, size_work1, size_work2, size_work3, size_work4, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + workE = mem[1]; + workTau = mem[2]; + workVec = mem[3]; + workSplits = 
mem[4]; + work1 = mem[5]; + work2 = mem[6]; + work3 = mem[7]; + work4 = mem[8]; + workArr = mem[9]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, info, batch_count, (T*)scalars, + (S*)workE, (T*)workTau, (T*)workVec, (rocblas_int*)workSplits, work1, work2, work3, work4, + workArr); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl(handle, evect, uplo, n, A, lda, D, strideD, + info, batch_count); +} + +rocblas_status rocsolver_dsyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl(handle, evect, uplo, n, A, lda, D, strideD, + info, batch_count); +} + +rocblas_status rocsolver_cheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl( + handle, evect, uplo, n, A, lda, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_zheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const 
rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl( + handle, evect, uplo, n, A, lda, D, strideD, info, batch_count); +} + +} // extern C diff --git a/library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp b/library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp new file mode 100644 index 000000000..e109755b7 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp @@ -0,0 +1,189 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_syevdj_heevdj.hpp" + +template +rocblas_status rocsolver_syevdj_heevdj_strided_batched_impl(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int lda, + const rocblas_stride strideA, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"syevdj_strided_batched" : "heevdj_strided_batched"); + ROCSOLVER_ENTER_TOP(name, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", lda, "--strideA", + strideA, "--strideD", strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st + = rocsolver_syevdj_heevdj_argCheck(handle, evect, uplo, n, A, lda, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // strided_batched execution + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + size_t size_work1; + size_t size_work2; + size_t size_work3; + // extra space for call stedc + size_t size_workSplits, size_work4; + // size of array of pointers (only for batched case) + size_t size_workArr; + // size for temporary householder scalars + size_t size_workTau; + // size for temporary vectors + size_t size_workVec; + // size for temporary superdiagonal of tridiag form + size_t size_workE; + + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &size_scalars, &size_workE, &size_workTau, &size_workVec, + &size_workSplits, &size_work1, &size_work2, &size_work3, &size_work4, &size_workArr); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_workE, size_workTau, size_workVec, size_workSplits, + size_work1, size_work2, size_work3, size_work4, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workVec, *workSplits, *workTau, *workArr; + rocblas_device_malloc mem(handle, size_scalars, size_workE, size_workTau, size_workVec, + size_workSplits, size_work1, size_work2, size_work3, size_work4, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + workE = mem[1]; + workTau = mem[2]; + workVec = mem[3]; + 
workSplits = mem[4]; + work1 = mem[5]; + work2 = mem[6]; + work3 = mem[7]; + work4 = mem[8]; + workArr = mem[9]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, info, batch_count, (T*)scalars, + (S*)workE, (T*)workTau, (T*)workVec, (rocblas_int*)workSplits, work1, work2, work3, work4, + workArr); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_dsyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_cheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, 
strideD, info, batch_count); +} + +rocblas_status rocsolver_zheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, strideD, info, batch_count); +} + +} // extern C diff --git a/library/src/lapack/roclapack_syevj_heevj.hpp b/library/src/lapack/roclapack_syevj_heevj.hpp index 840f3aebd..8dfa690f2 100644 --- a/library/src/lapack/roclapack_syevj_heevj.hpp +++ b/library/src/lapack/roclapack_syevj_heevj.hpp @@ -6,7 +6,30 @@ * and * Hari & Kovac (2019). On the Convergence of Complex Jacobi Methods. * Linear and Multilinear Algebra 69(3), p. 489-514. - * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * ***********************************************************************/ #pragma once @@ -95,8 +118,6 @@ __device__ void run_syevj(const rocblas_int dimx, } } } - cosines_res[tix] = local_res; - sines_diag[tix] = local_diag; } else { @@ -123,9 +144,9 @@ __device__ void run_syevj(const rocblas_int dimx, } } } - cosines_res[tix] = local_res; - sines_diag[tix] = local_diag; } + cosines_res[tix] = local_res; + sines_diag[tix] = local_diag; // initialize top/bottom pairs for(i = tix; i < half_n; i += dimx) @@ -155,9 +176,8 @@ __device__ void run_syevj(const rocblas_int dimx, { for(rocblas_int cc = 0; cc < count; ++cc) { - rocblas_int kx = tix + cc * dimx; - // get current top/bottom pair + rocblas_int kx = tix + cc * dimx; i = kx < half_n ? top[kx] : n; j = kx < half_n ? 
bottom[kx] : n; @@ -281,6 +301,7 @@ __device__ void run_syevj(const rocblas_int dimx, if(tiy == 0) { local_res = 0; + for(i = tix; i < n; i += dimx) { for(j = 0; j < i; j++) @@ -297,33 +318,35 @@ __device__ void run_syevj(const rocblas_int dimx, sweeps++; } - if(tiy > 0) - return; - // finalize outputs - if(tix == 0) + if(tiy == 0) { - *residual = sqrt(local_res); - if(sweeps <= max_sweeps) - { - *n_sweeps = sweeps; - *info = 0; - } - else + if(tix == 0) { - *n_sweeps = max_sweeps; - *info = 1; + *residual = sqrt(local_res); + if(sweeps <= max_sweeps) + { + *n_sweeps = sweeps; + *info = 0; + } + else + { + *n_sweeps = max_sweeps; + *info = 1; + } } - } - // update W and then sort eigenvalues and eigenvectors by selection sort - for(i = tix; i < n; i += dimx) - W[i] = std::real(Acpy[i + i * n]); + // update W + for(i = tix; i < n; i += dimx) + W[i] = std::real(Acpy[i + i * n]); + } __syncthreads(); - if((evect == rocblas_evect_none && tix > 0) || esort == rocblas_esort_none) + // if no sort, then stop + if(esort == rocblas_esort_none) return; + //otherwise sort eigenvalues and eigenvectors by selection sort rocblas_int m; S p; for(j = 0; j < n - 1; j++) @@ -340,7 +363,7 @@ __device__ void run_syevj(const rocblas_int dimx, } __syncthreads(); - if(m != j) + if(m != j && tiy == 0) { if(tix == 0) { @@ -354,19 +377,23 @@ __device__ void run_syevj(const rocblas_int dimx, swap(A[i + m * lda], A[i + j * lda]); } } + __syncthreads(); } } -__host__ __device__ inline void syevj_get_dims(rocblas_int n, rocblas_int* ddx, rocblas_int* ddy) +__host__ __device__ inline void + syevj_get_dims(rocblas_int n, rocblas_int bdim, rocblas_int* ddx, rocblas_int* ddy) { // (TODO: Some tuning could be beneficial in the future. // For now, we use a max of BDIM = ddx * ddy threads. - // ddy is set to ceil(n/2) and ddx to min(BDIM/ddy, ceil(n/2)). + // ddy is set to min(BDIM/4, ceil(n/2)) and ddx to min(BDIM/ddy, ceil(n/2)). 
rocblas_int even_n = n + n % 2; rocblas_int half_n = even_n / 2; - *ddy = half_n; - *ddx = min(SYEVJ_BDIM / half_n, half_n); + rocblas_int y = min(bdim / 4, half_n); + rocblas_int x = min(bdim / y, half_n); + *ddx = x; + *ddy = y; } template @@ -403,7 +430,7 @@ ROCSOLVER_KERNEL void __launch_bounds__(SYEVJ_BDIM) syevj_small_kernel(const roc // get dimensions of 2D thread array rocblas_int ddx, ddy; - syevj_get_dims(n, &ddx, &ddy); + syevj_get_dims(n, SYEVJ_BDIM, &ddx, &ddy); // shared memory extern __shared__ double lmem[]; @@ -1436,7 +1463,7 @@ rocblas_status rocsolver_syevj_heevj_template(rocblas_handle handle, // (TODO: SYEVJ_BLOCKED_SWITCH may need re-tuning as it could be larger than 64 now). rocblas_int ddx, ddy; - syevj_get_dims(n, &ddx, &ddy); + syevj_get_dims(n, SYEVJ_BDIM, &ddx, &ddy); dim3 grid(1, 1, batch_count); dim3 threads(ddx * ddy, 1, 1); size_t lmemsize = (sizeof(S) + sizeof(T)) * ddx + 2 * sizeof(rocblas_int) * half_n; diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj.cpp b/library/src/lapack/roclapack_sygvdj_hegvdj.cpp new file mode 100644 index 000000000..7469995d1 --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj.cpp @@ -0,0 +1,194 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_sygvdj_hegvdj.hpp" + +template +rocblas_status rocsolver_sygvdj_hegvdj_impl(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int lda, + U B, + const rocblas_int ldb, + S* D, + rocblas_int* info) +{ + const char* name = (!rocblas_is_complex ? 
"sygvdj" : "hegvdj"); + ROCSOLVER_ENTER_TOP(name, "--itype", itype, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", + lda, "--ldb", ldb); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st + = rocsolver_sygvdj_hegvdj_argCheck(handle, itype, evect, uplo, n, lda, ldb, A, B, D, info); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + rocblas_int shiftB = 0; + + // normal (non-batched non-strided) execution + rocblas_stride strideA = 0; + rocblas_stride strideB = 0; + rocblas_stride strideD = 0; + rocblas_int batch_count = 1; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + bool optim_mem; + size_t size_work1, size_work2, size_work3, size_work4; + // extra requirements for calling SYEVDJ/HEEVDJ + size_t size_workE; + size_t size_workTau; + size_t size_workVec; + size_t size_workSplits; + size_t size_workArr; + // size of temporary info array + size_t size_iinfo; + + rocsolver_sygvdj_hegvdj_getMemorySize( + itype, evect, uplo, n, batch_count, &size_scalars, &size_work1, &size_work2, &size_work3, + &size_work4, &size_workE, &size_workTau, &size_workVec, &size_workSplits, &size_iinfo, + &size_workArr, &optim_mem); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_work1, size_work2, size_work3, size_work4, size_workE, + size_workTau, size_workVec, size_workSplits, size_iinfo, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workTau, *workVec, *workSplits, + *workArr, *iinfo; + rocblas_device_malloc mem(handle, size_scalars, size_work1, size_work2, size_work3, size_work4, + size_workE, size_workTau, size_workVec, size_workSplits, size_iinfo, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + work1 = 
mem[1]; + work2 = mem[2]; + work3 = mem[3]; + work4 = mem[4]; + workE = mem[5]; + workTau = mem[6]; + workVec = mem[7]; + workSplits = mem[8]; + iinfo = mem[9]; + workArr = mem[10]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_sygvdj_hegvdj_template( + handle, itype, evect, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, D, strideD, + info, batch_count, (T*)scalars, work1, work2, work3, work4, (S*)workE, (T*)workTau, + (T*)workVec, (rocblas_int*)workSplits, (rocblas_int*)iinfo, workArr, optim_mem); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* B, + const rocblas_int ldb, + float* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, lda, B, ldb, D, + info); +} + +rocblas_status rocsolver_dsygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* B, + const rocblas_int ldb, + double* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, lda, B, ldb, D, + info); +} + +rocblas_status rocsolver_chegvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + rocblas_float_complex* B, + const rocblas_int ldb, + float* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, + lda, B, ldb, D, info); +} + +rocblas_status rocsolver_zhegvdj(rocblas_handle handle, + 
const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + rocblas_double_complex* B, + const rocblas_int ldb, + double* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, + lda, B, ldb, D, info); +} + +} // extern C diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj.hpp b/library/src/lapack/roclapack_sygvdj_hegvdj.hpp new file mode 100644 index 000000000..cdd6c5a9e --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj.hpp @@ -0,0 +1,273 @@ +/************************************************************************* + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "rocblas.hpp" +#include "roclapack_potrf.hpp" +#include "roclapack_syevdj_heevdj.hpp" +#include "roclapack_sygst_hegst.hpp" +#include "roclapack_sygv_hegv.hpp" +#include "rocsolver/rocsolver.h" + +template +rocblas_status rocsolver_sygvdj_hegvdj_argCheck(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + const rocblas_int lda, + const rocblas_int ldb, + T A, + T B, + S D, + rocblas_int* info, + const rocblas_int batch_count = 1) +{ + // order is important for unit tests: + + // 1. invalid/non-supported values + if(itype != rocblas_eform_ax && itype != rocblas_eform_abx && itype != rocblas_eform_bax) + return rocblas_status_invalid_value; + if(evect != rocblas_evect_none && evect != rocblas_evect_original) + return rocblas_status_invalid_value; + if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) + return rocblas_status_invalid_value; + + // 2. invalid size + if(n < 0 || lda < n || ldb < n || batch_count < 0) + return rocblas_status_invalid_size; + + // skip pointer check if querying memory size + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_status_continue; + + // 3. 
invalid pointers + if((n && !A) || (n && !B) || (n && !D) || (batch_count && !info)) + return rocblas_status_invalid_pointer; + + return rocblas_status_continue; +} + +template +void rocsolver_sygvdj_hegvdj_getMemorySize(const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + const rocblas_int batch_count, + size_t* size_scalars, + size_t* size_work1, + size_t* size_work2, + size_t* size_work3, + size_t* size_work4, + size_t* size_workE, + size_t* size_workTau, + size_t* size_workVec, + size_t* size_workSplits, + size_t* size_iinfo, + size_t* size_workArr, + bool* optim_mem) +{ + // if quick return no need of workspace + if(n == 0 || batch_count == 0) + { + *size_scalars = 0; + *size_work1 = 0; + *size_work2 = 0; + *size_work3 = 0; + *size_work4 = 0; + *size_workE = 0; + *size_workTau = 0; + *size_workVec = 0; + *size_iinfo = 0; + *size_workSplits = 0; + *size_workArr = 0; + *optim_mem = true; + return; + } + + bool opt1, opt2, opt3 = true; + size_t unused, temp1, temp2, temp3, temp4, temp5; + + // requirements for calling POTRF + rocsolver_potrf_getMemorySize(n, uplo, batch_count, size_scalars, + size_work1, size_work2, size_work3, + size_work4, size_workArr, size_iinfo, &opt1); + *size_iinfo = max(*size_iinfo, sizeof(rocblas_int) * batch_count); + + // requirements for calling SYGST/HEGST + rocsolver_sygst_hegst_getMemorySize(uplo, itype, n, batch_count, &unused, + &temp1, &temp2, &temp3, &temp4, &opt2); + *size_work1 = max(*size_work1, temp1); + *size_work2 = max(*size_work2, temp2); + *size_work3 = max(*size_work3, temp3); + *size_work4 = max(*size_work4, temp4); + + // requirements for calling SYEVDJ/HEEVDJ + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &unused, size_workE, size_workTau, size_workVec, + size_workSplits, &temp1, &temp2, &temp3, &temp4, &temp5); + *size_work1 = max(*size_work1, temp1); + *size_work2 = max(*size_work2, temp2); + *size_work3 = max(*size_work3, temp3); + 
*size_work4 = max(*size_work4, temp4); + *size_workArr = max(*size_workArr, temp5); + + if(evect == rocblas_evect_original) + { + if(itype == rocblas_eform_ax || itype == rocblas_eform_abx) + { + // requirements for calling TRSM + rocblas_operation trans + = (uplo == rocblas_fill_upper ? rocblas_operation_none + : rocblas_operation_conjugate_transpose); + rocsolver_trsm_mem(rocblas_side_left, trans, n, n, batch_count, + &temp1, &temp2, &temp3, &temp4, &opt3); + *size_work1 = max(*size_work1, temp1); + *size_work2 = max(*size_work2, temp2); + *size_work3 = max(*size_work3, temp3); + *size_work4 = max(*size_work4, temp4); + } + } + + *optim_mem = opt1 && opt2 && opt3; +} + +template +rocblas_status rocsolver_sygvdj_hegvdj_template(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + U B, + const rocblas_int shiftB, + const rocblas_int ldb, + const rocblas_stride strideB, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count, + T* scalars, + void* work1, + void* work2, + void* work3, + void* work4, + S* workE, + T* workTau, + T* workVec, + rocblas_int* workSplits, + rocblas_int* iinfo, + void* workArr, + bool optim_mem) +{ + ROCSOLVER_ENTER("sygvdj_hegvdj", "itype:", itype, "evect:", evect, "uplo:", uplo, "n:", n, + "shiftA:", shiftA, "lda:", lda, "shiftB:", shiftB, "ldb:", ldb, + "bc:", batch_count); + + // quick return + if(batch_count == 0) + return rocblas_status_success; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + rocblas_int blocksReset = (batch_count - 1) / BS1 + 1; + dim3 gridReset(blocksReset, 1, 1); + dim3 threads(BS1, 1, 1); + + // info=0 (starting with no errors) + ROCSOLVER_LAUNCH_KERNEL(reset_info, gridReset, threads, 0, stream, info, batch_count, 0); + + // quick return + if(n == 0) + return rocblas_status_success; + + // 
everything must be executed with scalars on the host + rocblas_pointer_mode old_mode; + rocblas_get_pointer_mode(handle, &old_mode); + rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host); + + // constants for rocblas functions calls + T one = 1; + + // perform Cholesky factorization of B + rocsolver_potrf_template(handle, uplo, n, B, shiftB, ldb, strideB, info, + batch_count, scalars, work1, work2, work3, + work4, (T*)workArr, iinfo, optim_mem); + + /** (TODO: Strictly speaking, computations should stop here if B is not positive definite. + A should not be modified in this case as no eigenvalues or eigenvectors can be computed. + Need to find a way to do this efficiently; for now A will be destroyed in the non + positive-definite case) **/ + + // reduce to standard eigenvalue problem and solve + rocsolver_sygst_hegst_template( + handle, itype, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, batch_count, + scalars, work1, work2, work3, work4, optim_mem); + + rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, iinfo, batch_count, scalars, + workE, workTau, workVec, workSplits, work1, work2, work3, work4, workArr); + + // combine info from POTRF with info from SYEVDJ/HEEVDJ + ROCSOLVER_LAUNCH_KERNEL(sygv_update_info, gridReset, threads, 0, stream, info, iinfo, n, + batch_count); + + // backtransform eigenvectors + if(evect == rocblas_evect_original) + { + if(itype == rocblas_eform_ax || itype == rocblas_eform_abx) + { + if(uplo == rocblas_fill_upper) + rocsolver_trsm_upper( + handle, rocblas_side_left, rocblas_operation_none, rocblas_diagonal_non_unit, n, + n, B, shiftB, ldb, strideB, A, shiftA, lda, strideA, batch_count, optim_mem, + work1, work2, work3, work4); + else + rocsolver_trsm_lower( + handle, rocblas_side_left, rocblas_operation_conjugate_transpose, + rocblas_diagonal_non_unit, n, n, B, shiftB, ldb, strideB, A, shiftA, lda, + strideA, batch_count, optim_mem, work1, work2, work3, work4); + } + else 
+ { + rocblas_operation trans + = (uplo == rocblas_fill_upper ? rocblas_operation_conjugate_transpose + : rocblas_operation_none); + rocblasCall_trmm(handle, rocblas_side_left, uplo, trans, rocblas_diagonal_non_unit, n, + n, &one, 0, B, shiftB, ldb, strideB, A, shiftA, lda, strideA, + batch_count, (T**)workArr); + } + } + + rocblas_set_pointer_mode(handle, old_mode); + return rocblas_status_success; +} diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp b/library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp new file mode 100644 index 000000000..0e04aca02 --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp @@ -0,0 +1,202 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_sygvdj_hegvdj.hpp" + +template +rocblas_status rocsolver_sygvdj_hegvdj_batched_impl(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int lda, + U B, + const rocblas_int ldb, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"sygvdj_batched" : "hegvdj_batched"); + ROCSOLVER_ENTER_TOP(name, "--itype", itype, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", + lda, "--ldb", ldb, "--strideD", strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_sygvdj_hegvdj_argCheck(handle, itype, evect, uplo, n, lda, ldb, A, + B, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + rocblas_int shiftB = 0; + + // batched execution + rocblas_stride strideA = 0; + rocblas_stride strideB = 0; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + bool optim_mem; + size_t size_work1, size_work2, size_work3, size_work4; + // extra requirements for calling SYEVDJ/HEEVDJ + size_t size_workE; + size_t size_workTau; + size_t size_workVec; + size_t size_workSplits; + size_t size_workArr; + // size of temporary info array + size_t size_iinfo; + + rocsolver_sygvdj_hegvdj_getMemorySize( + itype, evect, uplo, n, batch_count, &size_scalars, &size_work1, &size_work2, &size_work3, + &size_work4, &size_workE, &size_workTau, &size_workVec, &size_workSplits, &size_iinfo, + &size_workArr, &optim_mem); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_work1, size_work2, size_work3, size_work4, size_workE, + size_workTau, size_workVec, size_workSplits, size_iinfo, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workTau, *workVec, *workSplits, + *workArr, *iinfo; + rocblas_device_malloc mem(handle, size_scalars, size_work1, size_work2, size_work3, size_work4, + size_workE, size_workTau, size_workVec, size_workSplits, size_iinfo, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + work1 = mem[1]; 
+ work2 = mem[2]; + work3 = mem[3]; + work4 = mem[4]; + workE = mem[5]; + workTau = mem[6]; + workVec = mem[7]; + workSplits = mem[8]; + iinfo = mem[9]; + workArr = mem[10]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_sygvdj_hegvdj_template( + handle, itype, evect, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, D, strideD, + info, batch_count, (T*)scalars, work1, work2, work3, work4, (S*)workE, (T*)workTau, + (T*)workVec, (rocblas_int*)workSplits, (rocblas_int*)iinfo, workArr, optim_mem); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl(handle, itype, evect, uplo, n, A, lda, B, + ldb, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_dsygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl(handle, itype, evect, uplo, n, A, lda, B, + ldb, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_chegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + 
rocblas_float_complex* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl( + handle, itype, evect, uplo, n, A, lda, B, ldb, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_zhegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + rocblas_double_complex* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl( + handle, itype, evect, uplo, n, A, lda, B, ldb, D, strideD, info, batch_count); +} + +} // extern C diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp b/library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp new file mode 100644 index 000000000..e31903501 --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp @@ -0,0 +1,213 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_sygvdj_hegvdj.hpp" + +template +rocblas_status rocsolver_sygvdj_hegvdj_strided_batched_impl(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int lda, + const rocblas_stride strideA, + U B, + const rocblas_int ldb, + const rocblas_stride strideB, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"sygvdj_strided_batched" : "hegvdj_strided_batched"); + ROCSOLVER_ENTER_TOP(name, "--itype", itype, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", + lda, "--strideA", strideA, "--ldb", ldb, "--strideB", strideB, "--strideD", + strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_sygvdj_hegvdj_argCheck(handle, itype, evect, uplo, n, lda, ldb, A, + B, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + rocblas_int shiftB = 0; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + bool optim_mem; + size_t size_work1, size_work2, size_work3, size_work4; + // extra requirements for calling SYEVDJ/HEEVDJ + size_t size_workE; + size_t size_workTau; + size_t size_workVec; + size_t size_workSplits; + size_t size_workArr; + // size of temporary info array + size_t size_iinfo; + + rocsolver_sygvdj_hegvdj_getMemorySize( + itype, evect, uplo, n, batch_count, &size_scalars, &size_work1, &size_work2, &size_work3, + &size_work4, &size_workE, &size_workTau, &size_workVec, &size_workSplits, &size_iinfo, + &size_workArr, &optim_mem); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_work1, size_work2, size_work3, size_work4, size_workE, + size_workTau, size_workVec, size_workSplits, size_iinfo, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workTau, *workVec, *workSplits, + *workArr, *iinfo; + rocblas_device_malloc mem(handle, size_scalars, size_work1, size_work2, size_work3, size_work4, + size_workE, size_workTau, size_workVec, size_workSplits, size_iinfo, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + work1 = mem[1]; + work2 = mem[2]; + 
work3 = mem[3]; + work4 = mem[4]; + workE = mem[5]; + workTau = mem[6]; + workVec = mem[7]; + workSplits = mem[8]; + iinfo = mem[9]; + workArr = mem[10]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_sygvdj_hegvdj_template( + handle, itype, evect, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, D, strideD, + info, batch_count, (T*)scalars, work1, work2, work3, work4, (S*)workE, (T*)workTau, + (T*)workVec, (rocblas_int*)workSplits, (rocblas_int*)iinfo, workArr, optim_mem); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl(handle, itype, evect, uplo, n, A, + lda, strideA, B, ldb, strideB, D, + strideD, info, batch_count); +} + +rocblas_status rocsolver_dsygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl(handle, itype, evect, uplo, n, A, + lda, strideA, B, ldb, strideB, D, + strideD, info, batch_count); +} + +rocblas_status rocsolver_chegvdj_strided_batched(rocblas_handle handle, + const 
rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_float_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl( + handle, itype, evect, uplo, n, A, lda, strideA, B, ldb, strideB, D, strideD, info, + batch_count); +} + +rocblas_status rocsolver_zhegvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_double_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl( + handle, itype, evect, uplo, n, A, lda, strideA, B, ldb, strideB, D, strideD, info, + batch_count); +} + +} // extern C