From f8b1f3cd47ff309710b72669d94f2a3b49a615b9 Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Tue, 9 Jan 2024 09:45:47 -0700 Subject: [PATCH] Divide & Conquer + Jacobi (part 2) (#643) * prepare stedc to use other solvers * fix synchronization issues * address review comments * define APIs and add documentation * add syevdj/heevdj * add test infrastructure for syevdj/heevdj * fix test bug * add batched versions * enable jacobi solver in stedc * fix single-block case * fix synchronization problems * fix synchronization problems * clean up * add sygvdj/hegvdj * clean up * update changelog * fix synch problem showing on gfx1101 * addressed review comments --- CHANGELOG.md | 8 + clients/CMakeLists.txt | 4 +- clients/common/testing_syevdj_heevdj.cpp | 32 + clients/common/testing_sygvdj_hegvdj.cpp | 32 + clients/gtest/CMakeLists.txt | 4 +- clients/gtest/syevdj_heevdj_gtest.cpp | 193 +++ clients/gtest/sygvdj_hegvdj_gtest.cpp | 207 +++ clients/include/rocsolver.hpp | 322 +++- clients/include/rocsolver_dispatcher.hpp | 20 +- clients/include/testing_syevdj_heevdj.hpp | 590 +++++++ clients/include/testing_sygvdj_hegvdj.hpp | 726 +++++++++ docs/api/lapacklike.rst | 80 + docs/design/tuning.rst | 20 +- docs/userguide/intro.rst | 4 + .../include/rocsolver/rocsolver-functions.h | 1392 ++++++++++++++--- library/src/CMakeLists.txt | 8 +- library/src/auxiliary/rocauxiliary_stedc.hpp | 1382 +++++++++------- library/src/include/ideal_sizes.hpp | 8 + .../src/lapack/roclapack_syevdj_heevdj.cpp | 173 ++ .../src/lapack/roclapack_syevdj_heevdj.hpp | 265 ++++ .../roclapack_syevdj_heevdj_batched.cpp | 185 +++ ...oclapack_syevdj_heevdj_strided_batched.cpp | 189 +++ library/src/lapack/roclapack_syevj_heevj.hpp | 91 +- .../src/lapack/roclapack_sygvdj_hegvdj.cpp | 194 +++ .../src/lapack/roclapack_sygvdj_hegvdj.hpp | 273 ++++ .../roclapack_sygvdj_hegvdj_batched.cpp | 202 +++ ...oclapack_sygvdj_hegvdj_strided_batched.cpp | 213 +++ 27 files 
changed, 6070 insertions(+), 747 deletions(-) create mode 100644 clients/common/testing_syevdj_heevdj.cpp create mode 100644 clients/common/testing_sygvdj_hegvdj.cpp create mode 100644 clients/gtest/syevdj_heevdj_gtest.cpp create mode 100644 clients/gtest/sygvdj_hegvdj_gtest.cpp create mode 100644 clients/include/testing_syevdj_heevdj.hpp create mode 100644 clients/include/testing_sygvdj_hegvdj.hpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj.cpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj.hpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj_batched.cpp create mode 100644 library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj.cpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj.hpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp create mode 100644 library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index e1ffcf54c..060b9cccf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ Full documentation for rocSOLVER is available at [rocsolver.readthedocs.io](http ## rocSOLVER 3.25.0 for ROCm 6.1.0 +### Added +- Eigensolver routines for symmetric/hermitian matrices using Divide & Conquer and Jacobi algorithm: + - SYEVDJ (with batched and strided\_batched versions) + - HEEVDJ (with batched and strided\_batched versions) +- Generalized symmetric/hermitian-definite eigensolvers using Divide & Conquer and Jacobi algorithm: + - SYGVDJ (with batched and strided\_batched versions) + - HEGVDJ (with batched and strided\_batched versions) + ### Changed - Relaxed array length requirements for GESVDX with `rocblas_srange_index`. 
diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index 262b4121f..37b7d6439 100755 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -1,5 +1,5 @@ # ########################################################################## -# Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -127,10 +127,12 @@ if(BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) common/testing_sygsx_hegsx.cpp common/testing_syev_heev.cpp common/testing_syevd_heevd.cpp + common/testing_syevdj_heevdj.cpp common/testing_syevj_heevj.cpp common/testing_syevx_heevx.cpp common/testing_sygv_hegv.cpp common/testing_sygvd_hegvd.cpp + common/testing_sygvdj_hegvdj.cpp common/testing_sygvj_hegvj.cpp common/testing_sygvx_hegvx.cpp common/testing_geblttrf_npvt.cpp diff --git a/clients/common/testing_syevdj_heevdj.cpp b/clients/common/testing_syevdj_heevdj.cpp new file mode 100644 index 000000000..233d32348 --- /dev/null +++ b/clients/common/testing_syevdj_heevdj.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ************************************************************************ */ + +#include + +#define TESTING_SYEVDJ_HEEVDJ(...) template void testing_syevdj_heevdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(TESTING_SYEVDJ_HEEVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/common/testing_sygvdj_hegvdj.cpp b/clients/common/testing_sygvdj_hegvdj.cpp new file mode 100644 index 000000000..dcfc2220e --- /dev/null +++ b/clients/common/testing_sygvdj_hegvdj.cpp @@ -0,0 +1,32 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ************************************************************************ */ + +#include + +#define TESTING_SYGVDJ_HEGVDJ(...) template void testing_sygvdj_hegvdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(TESTING_SYGVDJ_HEGVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index acd477424..9a32d470e 100755 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -1,5 +1,5 @@ # ########################################################################## -# Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -69,6 +69,8 @@ set(roclapack_test_source sygvj_hegvj_gtest.cpp sygvx_hegvx_gtest.cpp sygvdx_hegvdx_gtest.cpp + syevdj_heevdj_gtest.cpp + sygvdj_hegvdj_gtest.cpp ) set(rocauxiliary_test_source diff --git a/clients/gtest/syevdj_heevdj_gtest.cpp b/clients/gtest/syevdj_heevdj_gtest.cpp new file mode 100644 index 000000000..25edcbc37 --- /dev/null +++ b/clients/gtest/syevdj_heevdj_gtest.cpp @@ -0,0 +1,193 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ************************************************************************ */ + +#include "testing_syevdj_heevdj.hpp" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using namespace std; + +typedef std::tuple, vector> syevdj_heevdj_tuple; + +// each size_range vector is a {n, lda} + +// each op_range vector is a {evect, uplo} + +// case when n == 0, evect == N, and uplo = L will also execute the bad arguments test +// (null handle, null pointers and invalid values) + +const vector> op_range = {{'N', 'L'}, {'N', 'U'}, {'V', 'L'}, {'V', 'U'}}; + +// for checkin_lapack tests +const vector> size_range = { + // quick return + {0, 1}, + // invalid + {-1, 1}, + {10, 5}, + // normal (valid) samples + {1, 1}, + {12, 12}, + {20, 30}, + {36, 36}, + {50, 60}}; + +// for daily_lapack tests +const vector> large_size_range = {{192, 192}, {256, 270}, {300, 300}}; + +Arguments syevdj_heevdj_setup_arguments(syevdj_heevdj_tuple tup) +{ + vector size = std::get<0>(tup); + vector op = std::get<1>(tup); + + Arguments arg; + + arg.set("n", size[0]); + arg.set("lda", size[1]); + + arg.set("evect", op[0]); + arg.set("uplo", op[1]); + + // only testing standard use case/defaults for strides + + arg.timing = 0; + + return arg; +} + +class SYEVDJ_HEEVDJ : public ::TestWithParam +{ +protected: + void TearDown() override + { + EXPECT_EQ(hipGetLastError(), hipSuccess); + } + + template + void 
run_tests() + { + Arguments arg = syevdj_heevdj_setup_arguments(GetParam()); + + if(arg.peek("n") == 0 && arg.peek("evect") == 'N' + && arg.peek("uplo") == 'L') + testing_syevdj_heevdj_bad_arg(); + + arg.batch_count = (BATCHED || STRIDED ? 3 : 1); + testing_syevdj_heevdj(arg); + } +}; + +class SYEVDJ : public SYEVDJ_HEEVDJ +{ +}; + +class HEEVDJ : public SYEVDJ_HEEVDJ +{ +}; + +// non-batch tests + +TEST_P(SYEVDJ, __float) +{ + run_tests(); +} + +TEST_P(SYEVDJ, __double) +{ + run_tests(); +} + +TEST_P(HEEVDJ, __float_complex) +{ + run_tests(); +} + +TEST_P(HEEVDJ, __double_complex) +{ + run_tests(); +} + +// batched tests + +TEST_P(SYEVDJ, batched__float) +{ + run_tests(); +} + +TEST_P(SYEVDJ, batched__double) +{ + run_tests(); +} + +TEST_P(HEEVDJ, batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEEVDJ, batched__double_complex) +{ + run_tests(); +} + +// strided_batched tests + +TEST_P(SYEVDJ, strided_batched__float) +{ + run_tests(); +} + +TEST_P(SYEVDJ, strided_batched__double) +{ + run_tests(); +} + +TEST_P(HEEVDJ, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEEVDJ, strided_batched__double_complex) +{ + run_tests(); +} + +// daily_lapack tests normal execution with medium to large sizes +INSTANTIATE_TEST_SUITE_P(daily_lapack, + SYEVDJ, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + HEEVDJ, + Combine(ValuesIn(large_size_range), ValuesIn(op_range))); + +// checkin_lapack tests normal execution with small sizes, invalid sizes, +// quick returns, and corner cases +INSTANTIATE_TEST_SUITE_P(checkin_lapack, SYEVDJ, Combine(ValuesIn(size_range), ValuesIn(op_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, HEEVDJ, Combine(ValuesIn(size_range), ValuesIn(op_range))); diff --git a/clients/gtest/sygvdj_hegvdj_gtest.cpp b/clients/gtest/sygvdj_hegvdj_gtest.cpp new file mode 100644 index 000000000..075c0be79 --- /dev/null +++ b/clients/gtest/sygvdj_hegvdj_gtest.cpp @@ -0,0 +1,207 @@ +/* 
************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * *************************************************************************/ + +#include "testing_sygvdj_hegvdj.hpp" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; +using ::testing::ValuesIn; +using namespace std; + +typedef std::tuple, vector> sygvdj_tuple; + +// each matrix_size_range is a {n, lda, ldb, singular} +// if singular = 1, then the used matrix for the tests is not positive definite + +// each type_range is a {itype, evect, uplo} + +// case when n = 0, itype = 1, evect = 'N', and uplo = U will also execute the bad arguments test +// (null handle, null pointers and invalid values) + +const vector> type_range + = {{'1', 'N', 'U'}, {'2', 'N', 'L'}, {'3', 'N', 'U'}, + {'1', 'V', 'L'}, {'2', 'V', 'U'}, {'3', 'V', 'L'}}; + +// for checkin_lapack tests +const vector> matrix_size_range = { + // quick return + {0, 1, 1, 0}, + // invalid + {-1, 1, 1, 0}, + {20, 5, 5, 0}, + // normal (valid) samples + {20, 30, 20, 1}, + {35, 35, 35, 0}, + {52, 52, 52, 1}, + {50, 50, 60, 1}}; + +// for daily_lapack tests +const vector> large_matrix_size_range = { + {192, 192, 192, 0}, + {256, 270, 256, 0}, + {300, 300, 310, 0}, +}; + +Arguments sygvdj_setup_arguments(sygvdj_tuple tup) +{ + vector matrix_size = std::get<0>(tup); + vector type = std::get<1>(tup); + + Arguments arg; + + arg.set("n", matrix_size[0]); + arg.set("lda", matrix_size[1]); + arg.set("ldb", matrix_size[2]); + + arg.set("itype", type[0]); + arg.set("evect", type[1]); + arg.set("uplo", type[2]); + + // only testing standard use case/defaults for strides + + arg.timing = 0; + arg.singular = matrix_size[3]; + + return arg; +} + +class SYGVDJ_HEGVDJ : public ::TestWithParam +{ +protected: + void TearDown() override + { + EXPECT_EQ(hipGetLastError(), hipSuccess); + } + + template + void run_tests() + { + Arguments arg = sygvdj_setup_arguments(GetParam()); + + if(arg.peek("itype") == '1' && arg.peek("evect") == 'N' + && arg.peek("uplo") == 'U' && arg.peek("n") == 0) + 
testing_sygvdj_hegvdj_bad_arg(); + + arg.batch_count = (BATCHED || STRIDED ? 3 : 1); + if(arg.singular == 1) + testing_sygvdj_hegvdj(arg); + + arg.singular = 0; + testing_sygvdj_hegvdj(arg); + } +}; + +class SYGVDJ : public SYGVDJ_HEGVDJ +{ +}; + +class HEGVDJ : public SYGVDJ_HEGVDJ +{ +}; + +// non-batch tests + +TEST_P(SYGVDJ, __float) +{ + run_tests(); +} + +TEST_P(SYGVDJ, __double) +{ + run_tests(); +} + +TEST_P(HEGVDJ, __float_complex) +{ + run_tests(); +} + +TEST_P(HEGVDJ, __double_complex) +{ + run_tests(); +} + +// batched tests + +TEST_P(SYGVDJ, batched__float) +{ + run_tests(); +} + +TEST_P(SYGVDJ, batched__double) +{ + run_tests(); +} + +TEST_P(HEGVDJ, batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEGVDJ, batched__double_complex) +{ + run_tests(); +} + +// strided_batched cases + +TEST_P(SYGVDJ, strided_batched__float) +{ + run_tests(); +} + +TEST_P(SYGVDJ, strided_batched__double) +{ + run_tests(); +} + +TEST_P(HEGVDJ, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(HEGVDJ, strided_batched__double_complex) +{ + run_tests(); +} + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + SYGVDJ, + Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + SYGVDJ, + Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + HEGVDJ, + Combine(ValuesIn(large_matrix_size_range), ValuesIn(type_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + HEGVDJ, + Combine(ValuesIn(matrix_size_range), ValuesIn(type_range))); diff --git a/clients/include/rocsolver.hpp b/clients/include/rocsolver.hpp index d66770af3..c8583158d 100644 --- a/clients/include/rocsolver.hpp +++ b/clients/include/rocsolver.hpp @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -7391,6 +7391,146 @@ inline rocblas_status rocsolver_syevd_heevd(bool STRIDED, } /********************************************************/ +/******************** SYEVDJ/HEEVDJ ********************/ +// normal and strided_batched +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* A, + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? rocsolver_ssyevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_ssyevdj(handle, evect, uplo, n, A, lda, D, info); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* A, + rocblas_int lda, + rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? rocsolver_dsyevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_dsyevdj(handle, evect, uplo, n, A, lda, D, info); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* A, + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? 
rocsolver_cheevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_cheevdj(handle, evect, uplo, n, A, lda, D, info); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* A, + rocblas_int lda, + rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return STRIDED + ? rocsolver_zheevdj_strided_batched(handle, evect, uplo, n, A, lda, stA, D, stD, info, bc) + : rocsolver_zheevdj(handle, evect, uplo, n, A, lda, D, info); +} + +// batched +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* const A[], + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_ssyevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* const A[], + rocblas_int lda, + rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_dsyevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* const A[], + rocblas_int lda, + rocblas_stride stA, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_cheevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} + +inline rocblas_status rocsolver_syevdj_heevdj(bool STRIDED, + rocblas_handle handle, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* const A[], + rocblas_int lda, + 
rocblas_stride stA, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_zheevdj_batched(handle, evect, uplo, n, A, lda, D, stD, info, bc); +} +/********************************************************/ + /******************** SYEVJ/HEEVJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_syevj_heevj(bool STRIDED, @@ -8431,6 +8571,186 @@ inline rocblas_status rocsolver_sygvd_hegvd(bool STRIDED, } /********************************************************/ +/******************** SYGVDJ_HEGVDJ ********************/ +// normal and strided_batched +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* A, + rocblas_int lda, + rocblas_stride stA, + float* B, + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_ssygvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_ssygvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* A, + rocblas_int lda, + rocblas_stride stA, + double* B, + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_dsygvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_dsygvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* A, + 
rocblas_int lda, + rocblas_stride stA, + rocblas_float_complex* B, + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_chegvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_chegvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* A, + rocblas_int lda, + rocblas_stride stA, + rocblas_double_complex* B, + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + if(STRIDED) + return rocsolver_zhegvdj_strided_batched(handle, itype, evect, uplo, n, A, lda, stA, B, ldb, + stB, D, stD, info, bc); + else + return rocsolver_zhegvdj(handle, itype, evect, uplo, n, A, lda, B, ldb, D, info); +} + +// batched +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + float* const A[], + rocblas_int lda, + rocblas_stride stA, + float* const B[], + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_ssygvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + double* const A[], + rocblas_int lda, + rocblas_stride stA, + double* const B[], + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_dsygvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} + 
+inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_float_complex* const A[], + rocblas_int lda, + rocblas_stride stA, + rocblas_float_complex* const B[], + rocblas_int ldb, + rocblas_stride stB, + float* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_chegvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} + +inline rocblas_status rocsolver_sygvdj_hegvdj(bool STRIDED, + rocblas_handle handle, + rocblas_eform itype, + rocblas_evect evect, + rocblas_fill uplo, + rocblas_int n, + rocblas_double_complex* const A[], + rocblas_int lda, + rocblas_stride stA, + rocblas_double_complex* const B[], + rocblas_int ldb, + rocblas_stride stB, + double* D, + rocblas_stride stD, + rocblas_int* info, + rocblas_int bc) +{ + return rocsolver_zhegvdj_batched(handle, itype, evect, uplo, n, A, lda, B, ldb, D, stD, info, bc); +} +/********************************************************/ + /******************** SYGVJ_HEGVJ ********************/ // normal and strided_batched inline rocblas_status rocsolver_sygvj_hegvj(bool STRIDED, diff --git a/clients/include/rocsolver_dispatcher.hpp b/clients/include/rocsolver_dispatcher.hpp index 75de0bb91..815f74d70 100644 --- a/clients/include/rocsolver_dispatcher.hpp +++ b/clients/include/rocsolver_dispatcher.hpp @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -90,11 +90,13 @@ #include "testing_sterf.hpp" #include "testing_syev_heev.hpp" #include "testing_syevd_heevd.hpp" +#include "testing_syevdj_heevdj.hpp" #include "testing_syevj_heevj.hpp" #include "testing_syevx_heevx.hpp" #include "testing_sygsx_hegsx.hpp" #include "testing_sygv_hegv.hpp" #include "testing_sygvd_hegvd.hpp" +#include "testing_sygvdj_hegvdj.hpp" #include "testing_sygvj_hegvj.hpp" #include "testing_sygvx_hegvx.hpp" #include "testing_sytf2_sytrf.hpp" @@ -322,6 +324,10 @@ class rocsolver_dispatcher {"syevd", testing_syevd_heevd}, {"syevd_batched", testing_syevd_heevd}, {"syevd_strided_batched", testing_syevd_heevd}, + // syevdj + {"syevdj", testing_syevdj_heevdj}, + {"syevdj_batched", testing_syevdj_heevdj}, + {"syevdj_strided_batched", testing_syevdj_heevdj}, // syevj {"syevj", testing_syevj_heevj}, {"syevj_batched", testing_syevj_heevj}, @@ -338,6 +344,10 @@ class rocsolver_dispatcher {"sygvd", testing_sygvd_hegvd}, {"sygvd_batched", testing_sygvd_hegvd}, {"sygvd_strided_batched", testing_sygvd_hegvd}, + // sygvdj + {"sygvdj", testing_sygvdj_hegvdj}, + {"sygvdj_batched", testing_sygvdj_hegvdj}, + {"sygvdj_strided_batched", testing_sygvdj_hegvdj}, // sygvj {"sygvj", testing_sygvj_hegvj}, {"sygvj_batched", testing_sygvj_hegvj}, @@ -412,6 +422,10 @@ class rocsolver_dispatcher {"heevd", testing_syevd_heevd}, {"heevd_batched", testing_syevd_heevd}, {"heevd_strided_batched", testing_syevd_heevd}, + // heevdj + {"heevdj", testing_syevdj_heevdj}, + {"heevdj_batched", testing_syevdj_heevdj}, + {"heevdj_strided_batched", testing_syevdj_heevdj}, // heevj {"heevj", testing_syevj_heevj}, {"heevj_batched", testing_syevj_heevj}, @@ -428,6 +442,10 @@ class rocsolver_dispatcher {"hegvd", testing_sygvd_hegvd}, {"hegvd_batched", testing_sygvd_hegvd}, {"hegvd_strided_batched", testing_sygvd_hegvd}, + // hegvdj + {"hegvdj", 
testing_sygvdj_hegvdj}, + {"hegvdj_batched", testing_sygvdj_hegvdj}, + {"hegvdj_strided_batched", testing_sygvdj_hegvdj}, // hegvj {"hegvj", testing_sygvj_hegvj}, {"hegvj_batched", testing_sygvj_hegvj}, diff --git a/clients/include/testing_syevdj_heevdj.hpp b/clients/include/testing_syevdj_heevdj.hpp new file mode 100644 index 000000000..128f4e515 --- /dev/null +++ b/clients/include/testing_syevdj_heevdj.hpp @@ -0,0 +1,590 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ************************************************************************ */ + +#pragma once + +#include "client_util.hpp" +#include "clientcommon.hpp" +#include "lapack_host_reference.hpp" +#include "norm.hpp" +#include "rocsolver.hpp" +#include "rocsolver_arguments.hpp" +#include "rocsolver_test.hpp" + +template +void syevdj_heevdj_checkBadArgs(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + T dA, + const rocblas_int lda, + const rocblas_stride stA, + S dD, + const rocblas_stride stD, + U dinfo, + const rocblas_int bc) +{ + // handle + EXPECT_ROCBLAS_STATUS( + rocsolver_syevdj_heevdj(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc), + rocblas_status_invalid_handle); + + // values + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, rocblas_evect(0), uplo, n, dA, + lda, stA, dD, stD, dinfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, rocblas_fill_full, n, dA, + lda, stA, dD, stD, dinfo, bc), + rocblas_status_invalid_value); + + // sizes (only check batch_count if applicable) + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, + dD, stD, dinfo, -1), + rocblas_status_invalid_size); + + // pointers + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, + stA, dD, stD, dinfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, + (S) nullptr, stD, dinfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, + stD, (U) nullptr, bc), + rocblas_status_invalid_pointer); + + // quick return with invalid pointers + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, + stA, (S) nullptr, stD, dinfo, bc), + 
rocblas_status_success); + + // quick return with zero batch_count if applicable + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, + dD, stD, (U) nullptr, 0), + rocblas_status_success); +} + +template +void testing_syevdj_heevdj_bad_arg() +{ + using S = decltype(std::real(T{})); + + // safe arguments + rocblas_local_handle handle; + rocblas_evect evect = rocblas_evect_none; + rocblas_fill uplo = rocblas_fill_lower; + rocblas_int n = 1; + rocblas_int lda = 1; + rocblas_stride stA = 1; + rocblas_stride stD = 1; + rocblas_int bc = 1; + + if(BATCHED) + { + // memory allocations + device_batch_vector dA(1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dinfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dinfo.memcheck()); + + // check bad arguments + syevdj_heevdj_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), + stD, dinfo.data(), bc); + } + else + { + // memory allocations + device_strided_batch_vector dA(1, 1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dinfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dinfo.memcheck()); + + // check bad arguments + syevdj_heevdj_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), + stD, dinfo.data(), bc); + } +} + +template +void syevdj_heevdj_initData(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_int bc, + Th& hA, + std::vector& A, + bool test = true) +{ + if(CPU) + { + rocblas_init(hA, true); + + // scale A to avoid singularities + for(rocblas_int b = 0; b < bc; ++b) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + { + if(i == j) + hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; + else + hA[b][i + j * lda] -= 4; + } + } + + // 
make copy of original data to test vectors if required + if(test && evect == rocblas_evect_original) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; + } + } + } + } + + if(GPU) + { + // now copy to the GPU + CHECK_HIP_ERROR(dA.transfer_from(hA)); + } +} + +template +void syevdj_heevdj_getError(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Sd& dD, + const rocblas_stride stD, + Id& dinfo, + const rocblas_int bc, + Th& hA, + Th& hAres, + Sh& hD, + Sh& hDres, + Ih& hinfo, + Ih& hinfoRes, + double* max_err) +{ + constexpr bool COMPLEX = rocblas_is_complex; + using S = decltype(std::real(T{})); + + int lwork = (COMPLEX ? 2 * n - 1 : 0); + int lrwork = 3 * n - 1; + std::vector work(lwork); + std::vector rwork(lrwork); + std::vector A(lda * n * bc); + + // input data initialization + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A); + + // execute computations + // GPU lapack + CHECK_ROCBLAS_ERROR(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, + stA, dD.data(), stD, dinfo.data(), bc)); + + CHECK_HIP_ERROR(hDres.transfer_from(dD)); + CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); + if(evect == rocblas_evect_original) + CHECK_HIP_ERROR(hAres.transfer_from(dA)); + + // CPU lapack + for(rocblas_int b = 0; b < bc; ++b) + cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, rwork.data(), lrwork, + hinfo[b]); + + // Check info for non-convergence + *max_err = 0; + for(rocblas_int b = 0; b < bc; ++b) + { + EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; + if(hinfo[b][0] != hinfoRes[b][0]) + *max_err += 1; + } + + // (We expect the used input matrices to always converge. 
Testing + // implicitly the equivalent non-converged matrix is very complicated and it boils + // down to essentially run the algorithm again and until convergence is achieved). + + double err = 0; + + for(rocblas_int b = 0; b < bc; ++b) + { + if(evect != rocblas_evect_original) + { + // only eigenvalues needed; can compare with LAPACK + + // error is ||hD - hDRes|| / ||hD|| + // using frobenius norm + if(hinfo[b][0] == 0) + err = norm_error('F', 1, n, 1, hD[b], hDres[b]); + *max_err = err > *max_err ? err : *max_err; + } + else + { + // both eigenvalues and eigenvectors needed; need to implicitly test + // eigenvectors due to non-uniqueness of eigenvectors under scaling + if(hinfo[b][0] == 0) + { + // multiply A with each of the n eigenvectors and divide by corresponding + // eigenvalues + T alpha; + T beta = 0; + for(int j = 0; j < n; j++) + { + alpha = T(1) / hDres[b][j]; + cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, + 1, beta, hA[b] + j * lda, 1); + } + + // error is ||hA - hARes|| / ||hA|| + // using frobenius norm + err = norm_error('F', n, n, lda, hA[b], hAres[b]); + *max_err = err > *max_err ? 
err : *max_err; + } + } + } +} + +template +void syevdj_heevdj_getPerfData(const rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Sd& dD, + const rocblas_stride stD, + Id& dinfo, + const rocblas_int bc, + Th& hA, + Sh& hD, + Ih& hinfo, + double* gpu_time_used, + double* cpu_time_used, + const rocblas_int hot_calls, + const int profile, + const bool profile_kernels, + const bool perf) +{ + std::vector A(lda * n * bc); + + if(!perf) + { + // cpu-lapack performance (only if not in perf mode) + *cpu_time_used = nan(""); + } + + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); + + // cold calls + for(int iter = 0; iter < 2; iter++) + { + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); + + CHECK_ROCBLAS_ERROR(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, + stA, dD.data(), stD, dinfo.data(), bc)); + } + + // gpu-lapack performance + hipStream_t stream; + CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); + double start; + + if(profile > 0) + { + if(profile_kernels) + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile + | rocblas_layer_mode_ex_log_kernel); + else + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); + rocsolver_log_set_max_levels(profile); + } + + for(rocblas_int iter = 0; iter < hot_calls; iter++) + { + syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); + + start = get_time_us_sync(stream); + rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), + stD, dinfo.data(), bc); + *gpu_time_used += get_time_us_sync(stream) - start; + } + *gpu_time_used /= hot_calls; +} + +template +void testing_syevdj_heevdj(Arguments& argus) +{ + using S = decltype(std::real(T{})); + + // get arguments + rocblas_local_handle handle; + char evectC = argus.get("evect"); + char uploC = argus.get("uplo"); + rocblas_int n = 
argus.get("n"); + rocblas_int lda = argus.get("lda", n); + rocblas_stride stA = argus.get("strideA", lda * n); + rocblas_stride stD = argus.get("strideD", n); + + rocblas_evect evect = char2rocblas_evect(evectC); + rocblas_fill uplo = char2rocblas_fill(uploC); + rocblas_int bc = argus.batch_count; + rocblas_int hot_calls = argus.iters; + + // check non-supported values + if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T* const*)nullptr, lda, stA, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + else + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T*)nullptr, lda, stA, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_args); + + return; + } + + // determine sizes + size_t size_A = size_t(lda) * n; + size_t size_D = n; + size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; + size_t size_Dres = (argus.unit_check || argus.norm_check) ? 
size_D : 0; + + double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; + + // check invalid sizes + bool invalid_size = (n < 0 || lda < n || bc < 0); + if(invalid_size) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T* const*)nullptr, lda, stA, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + else + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T*)nullptr, lda, stA, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_size); + + return; + } + + // memory size query is necessary + if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) + { + CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); + if(BATCHED) + CHECK_ALLOC_QUERY(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, + (T* const*)nullptr, lda, stA, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc)); + else + CHECK_ALLOC_QUERY(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T*)nullptr, + lda, stA, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc)); + + size_t size; + CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); + if(argus.mem_query) + { + rocsolver_bench_inform(inform_mem_query, size); + return; + } + + CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); + } + + // memory allocations (all cases) + // host + host_strided_batch_vector hD(size_D, 1, stD, bc); + host_strided_batch_vector hinfo(1, 1, 1, bc); + host_strided_batch_vector hinfoRes(1, 1, 1, bc); + host_strided_batch_vector hDres(size_Dres, 1, stD, bc); + // device + device_strided_batch_vector dD(size_D, 1, stD, bc); + device_strided_batch_vector dinfo(1, 1, 1, bc); + if(size_D) + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dinfo.memcheck()); + + if(BATCHED) + { + // memory allocations + host_batch_vector hA(size_A, 1, bc); + host_batch_vector hAres(size_Ares, 1, 
bc); + device_batch_vector dA(size_A, 1, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), + lda, stA, dD.data(), stD, dinfo.data(), bc), + rocblas_status_success); + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + { + syevdj_heevdj_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, + bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); + } + + // collect performance data + if(argus.timing) + { + syevdj_heevdj_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, + dinfo, bc, hA, hD, hinfo, &gpu_time_used, + &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf); + } + } + + else + { + // memory allocations + host_strided_batch_vector hA(size_A, 1, stA, bc); + host_strided_batch_vector hAres(size_Ares, 1, stA, bc); + device_strided_batch_vector dA(size_A, 1, stA, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), + lda, stA, dD.data(), stD, dinfo.data(), bc), + rocblas_status_success); + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + { + syevdj_heevdj_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, + bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); + } + + // collect performance data + if(argus.timing) + { + syevdj_heevdj_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, + dinfo, bc, hA, hD, hinfo, &gpu_time_used, + &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf); + } + } + + // validate results for rocsolver-test + // using 2 * n * machine_precision as tolerance + 
if(argus.unit_check) + ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); + + // output results for rocsolver-bench + if(argus.timing) + { + if(!argus.perf) + { + rocsolver_bench_header("Arguments:"); + if(BATCHED) + { + rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "batch_c"); + rocsolver_bench_output(evectC, uploC, n, lda, stD, bc); + } + else if(STRIDED) + { + rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "batch_c"); + rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, bc); + } + else + { + rocsolver_bench_output("evect", "uplo", "n", "lda"); + rocsolver_bench_output(evectC, uploC, n, lda); + } + rocsolver_bench_header("Results:"); + if(argus.norm_check) + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); + rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); + } + else + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us"); + rocsolver_bench_output(cpu_time_used, gpu_time_used); + } + rocsolver_bench_endl(); + } + else + { + if(argus.norm_check) + rocsolver_bench_output(gpu_time_used, max_error); + else + rocsolver_bench_output(gpu_time_used); + } + } + + // ensure all arguments were consumed + argus.validate_consumed(); +} + +#define EXTERN_TESTING_SYEVDJ_HEEVDJ(...) \ + extern template void testing_syevdj_heevdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(EXTERN_TESTING_SYEVDJ_HEEVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/clients/include/testing_sygvdj_hegvdj.hpp b/clients/include/testing_sygvdj_hegvdj.hpp new file mode 100644 index 000000000..6d739d690 --- /dev/null +++ b/clients/include/testing_sygvdj_hegvdj.hpp @@ -0,0 +1,726 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * *************************************************************************/ + +#pragma once + +#include "client_util.hpp" +#include "clientcommon.hpp" +#include "lapack_host_reference.hpp" +#include "norm.hpp" +#include "rocsolver.hpp" +#include "rocsolver_arguments.hpp" +#include "rocsolver_test.hpp" + +template +void sygvdj_hegvdj_checkBadArgs(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + T dA, + const rocblas_int lda, + const rocblas_stride stA, + T dB, + const rocblas_int ldb, + const rocblas_stride stB, + U dD, + const rocblas_stride stD, + rocblas_int* dInfo, + const rocblas_int bc) +{ + // handle + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, + stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_handle); + + // values + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, rocblas_eform(0), evect, uplo, n, + dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, rocblas_evect(0), uplo, n, + dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, rocblas_evect_tridiagonal, + uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, + dInfo, bc), + rocblas_status_invalid_value); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, rocblas_fill_full, + n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_value); + + // sizes (only check batch_count if applicable) + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, + lda, stA, dB, ldb, stB, dD, stD, dInfo, -1), + rocblas_status_invalid_size); + + // pointers + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, + 
lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, + stA, (T) nullptr, ldb, stB, dD, stD, dInfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, + stA, dB, ldb, stB, (U) nullptr, stD, dInfo, bc), + rocblas_status_invalid_pointer); + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, + dB, ldb, stB, dD, stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_pointer); + + // quick return with invalid pointers + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, 0, + (T) nullptr, lda, stA, (T) nullptr, ldb, stB, + (U) nullptr, stD, dInfo, bc), + rocblas_status_success); + + // quick return with zero batch_count if applicable + if(STRIDED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, + lda, stA, dB, ldb, stB, dD, stD, + (rocblas_int*)nullptr, 0), + rocblas_status_success); +} + +template +void testing_sygvdj_hegvdj_bad_arg() +{ + using S = decltype(std::real(T{})); + + // safe arguments + rocblas_local_handle handle; + rocblas_int n = 1; + rocblas_int lda = 1; + rocblas_int ldb = 1; + rocblas_stride stA = 1; + rocblas_stride stB = 1; + rocblas_stride stD = 1; + rocblas_int bc = 1; + rocblas_eform itype = rocblas_eform_ax; + rocblas_evect evect = rocblas_evect_none; + rocblas_fill uplo = rocblas_fill_upper; + + if(BATCHED) + { + // memory allocations + device_batch_vector dA(1, 1, 1); + device_batch_vector dB(1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dInfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dB.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dInfo.memcheck()); + + // check bad arguments + sygvdj_hegvdj_checkBadArgs(handle, itype, evect, uplo, 
n, dA.data(), lda, stA, + dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); + } + else + { + // memory allocations + device_strided_batch_vector dA(1, 1, 1, 1); + device_strided_batch_vector dB(1, 1, 1, 1); + device_strided_batch_vector dD(1, 1, 1, 1); + device_strided_batch_vector dInfo(1, 1, 1, 1); + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dB.memcheck()); + CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dInfo.memcheck()); + + // check bad arguments + sygvdj_hegvdj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, + dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); + } +} + +template +void sygvdj_hegvdj_initData(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Td& dB, + const rocblas_int ldb, + const rocblas_stride stB, + const rocblas_int bc, + Th& hA, + Th& hB, + host_strided_batch_vector& A, + host_strided_batch_vector& B, + const bool test, + const bool singular) +{ + if(CPU) + { + rocblas_int info; + rocblas_init(hA, true); + rocblas_init(hB, false); + + for(rocblas_int b = 0; b < bc; ++b) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + { + if(i == j) + { + hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; + hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; + } + else + { + hA[b][i + j * lda] -= 4; + } + } + } + + if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) + { + // make some matrices B not positive definite + // always the same elements for debugging purposes + // the algorithm must detect the lower order of the principal minors <= 0 + // in those matrices in the batch that are non positive definite + rocblas_int i = n / 4 + b; + i -= (i / n) * n; + hB[b][i + i * ldb] = 0; + i = n / 2 + b; + i -= (i / n) * n; + hB[b][i + i * ldb] = 0; + i = n - 1 + b; + i -= (i / n) * n; + hB[b][i + i * ldb] = 0; + } + + // store A and B for 
testing purposes + if(test && evect != rocblas_evect_none) + { + for(rocblas_int i = 0; i < n; i++) + { + for(rocblas_int j = 0; j < n; j++) + { + if(itype != rocblas_eform_bax) + { + A[b][i + j * lda] = hA[b][i + j * lda]; + B[b][i + j * ldb] = hB[b][i + j * ldb]; + } + else + { + A[b][i + j * lda] = hB[b][i + j * ldb]; + B[b][i + j * ldb] = hA[b][i + j * lda]; + } + } + } + } + } + } + + if(GPU) + { + // now copy data to the GPU + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dB.transfer_from(hB)); + } +} + +template +void sygvdj_hegvdj_getError(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Td& dB, + const rocblas_int ldb, + const rocblas_stride stB, + Ud& dD, + const rocblas_stride stD, + Vd& dInfo, + const rocblas_int bc, + Th& hA, + Th& hARes, + Th& hB, + Uh& hD, + Uh& hDRes, + Vh& hInfo, + Vh& hInfoRes, + double* max_err, + const bool singular) +{ + constexpr bool COMPLEX = rocblas_is_complex; + using S = decltype(std::real(T{})); + + rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); + rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); + std::vector work(lwork); + std::vector rwork(lrwork); + host_strided_batch_vector A(lda * n, 1, lda * n, bc); + host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); + + // input data initialization + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, + hA, hB, A, B, true, singular); + + // execute computations + // GPU lapack + CHECK_ROCBLAS_ERROR(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), + lda, stA, dB.data(), ldb, stB, dD.data(), stD, + dInfo.data(), bc)); + + CHECK_HIP_ERROR(hDRes.transfer_from(dD)); + CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); + if(evect != rocblas_evect_none) + CHECK_HIP_ERROR(hARes.transfer_from(dA)); + + // CPU lapack + for(rocblas_int b = 0; b < bc; ++b) + { + cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, + rwork.data(), hInfo[b]); + } + + // (We expect the used input matrices to always converge. Testing + // implicitly the equivalent non-converged matrix is very complicated and it boils + // down to essentially run the algorithm again and until convergence is achieved. + // We do test with indefinite matrices B). + + // check info for non-convergence and/or positive-definiteness + *max_err = 0; + for(rocblas_int b = 0; b < bc; ++b) + { + EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; + if(hInfo[b][0] != hInfoRes[b][0]) + *max_err += 1; + } + + double err; + + for(rocblas_int b = 0; b < bc; ++b) + { + if(evect == rocblas_evect_none) + { + // only eigenvalues needed; can compare with LAPACK + + // error is ||hD - hDRes|| / ||hD|| + // using frobenius norm + if(hInfoRes[b][0] == 0) + { + err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); + *max_err = err > *max_err ? 
err : *max_err; + } + } + else + { + // both eigenvalues and eigenvectors needed; need to implicitly test + // eigenvectors due to non-uniqueness of eigenvectors under scaling + if(hInfoRes[b][0] == 0) + { + T alpha = 1; + T beta = 0; + + // hARes contains eigenvectors x + // compute B*x (or A*x) and store in hB + cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, + hB[b], ldb); + + if(itype == rocblas_eform_ax) + { + // problem is A*x = (lambda)*B*x + + // compute (1/lambda)*A*x and store in hA + for(int j = 0; j < n; j++) + { + alpha = T(1) / hDRes[b][j]; + cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, + hA[b] + j * lda, 1); + } + + // move B*x into hARes + for(rocblas_int i = 0; i < n; i++) + for(rocblas_int j = 0; j < n; j++) + hARes[b][i + j * lda] = hB[b][i + j * ldb]; + } + else + { + // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x + + // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA + for(int j = 0; j < n; j++) + { + alpha = T(1) / hDRes[b][j]; + cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, + hA[b] + j * lda, 1); + } + } + + // error is ||hA - hARes|| / ||hA|| + // using frobenius norm + err = norm_error('F', n, n, lda, hA[b], hARes[b]); + *max_err = err > *max_err ? 
err : *max_err; + } + } + } +} + +template +void sygvdj_hegvdj_getPerfData(const rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + Td& dA, + const rocblas_int lda, + const rocblas_stride stA, + Td& dB, + const rocblas_int ldb, + const rocblas_stride stB, + Ud& dD, + const rocblas_stride stD, + Vd& dInfo, + const rocblas_int bc, + Th& hA, + Th& hB, + Uh& hD, + Vh& hInfo, + double* gpu_time_used, + double* cpu_time_used, + const rocblas_int hot_calls, + const int profile, + const bool profile_kernels, + const bool perf, + const bool singular) +{ + host_strided_batch_vector A(1, 1, 1, 1); + host_strided_batch_vector B(1, 1, 1, 1); + + if(!perf) + { + // cpu-lapack performance (only if not in perf mode) + *cpu_time_used = nan(""); + } + + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, + hA, hB, A, B, false, singular); + + // cold calls + for(int iter = 0; iter < 2; iter++) + { + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, + bc, hA, hB, A, B, false, singular); + + CHECK_ROCBLAS_ERROR(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + dA.data(), lda, stA, dB.data(), ldb, stB, + dD.data(), stD, dInfo.data(), bc)); + } + + // gpu-lapack performance + hipStream_t stream; + CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); + double start; + + if(profile > 0) + { + if(profile_kernels) + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile + | rocblas_layer_mode_ex_log_kernel); + else + rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); + rocsolver_log_set_max_levels(profile); + } + + for(rocblas_int iter = 0; iter < hot_calls; iter++) + { + sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, + bc, hA, hB, A, B, false, singular); + + start = get_time_us_sync(stream); + rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, + dB.data(), 
ldb, stB, dD.data(), stD, dInfo.data(), bc); + *gpu_time_used += get_time_us_sync(stream) - start; + } + *gpu_time_used /= hot_calls; +} + +template +void testing_sygvdj_hegvdj(Arguments& argus) +{ + using S = decltype(std::real(T{})); + + // get arguments + rocblas_local_handle handle; + char itypeC = argus.get("itype"); + char evectC = argus.get("evect"); + char uploC = argus.get("uplo"); + rocblas_int n = argus.get("n"); + rocblas_int lda = argus.get("lda", n); + rocblas_int ldb = argus.get("ldb", n); + rocblas_stride stA = argus.get("strideA", lda * n); + rocblas_stride stB = argus.get("strideB", ldb * n); + rocblas_stride stD = argus.get("strideD", n); + + rocblas_eform itype = char2rocblas_eform(itypeC); + rocblas_evect evect = char2rocblas_evect(evectC); + rocblas_fill uplo = char2rocblas_fill(uploC); + rocblas_int bc = argus.batch_count; + rocblas_int hot_calls = argus.iters; + + rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; + rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; + + // check non-supported values + if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T* const*)nullptr, lda, stA, + (T* const*)nullptr, ldb, stB, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + else + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T*)nullptr, lda, stA, (T*)nullptr, ldb, + stB, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_value); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_args); + + return; + } + + // determine sizes + size_t size_A = size_t(lda) * n; + size_t size_B = size_t(ldb) * n; + size_t size_D = size_t(n); + double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; + + size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; + size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; + + // check invalid sizes + bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); + if(invalid_size) + { + if(BATCHED) + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T* const*)nullptr, lda, stA, + (T* const*)nullptr, ldb, stB, (S*)nullptr, + stD, (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + else + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T*)nullptr, lda, stA, (T*)nullptr, ldb, + stB, (S*)nullptr, stD, + (rocblas_int*)nullptr, bc), + rocblas_status_invalid_size); + + if(argus.timing) + rocsolver_bench_inform(inform_invalid_size); + + return; + } + + // memory size query is necessary + if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) + { + CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); + if(BATCHED) + CHECK_ALLOC_QUERY(rocsolver_sygvdj_hegvdj( + STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, + (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); + else + CHECK_ALLOC_QUERY(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, + (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); + + size_t size; + CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); + if(argus.mem_query) + { + rocsolver_bench_inform(inform_mem_query, size); + return; + } + + CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); + } + + // memory allocations (all cases) + // host + host_strided_batch_vector hD(size_D, 1, stD, bc); + host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); + host_strided_batch_vector hInfo(1, 1, 1, bc); + host_strided_batch_vector hInfoRes(1, 1, 1, bc); + // device + device_strided_batch_vector dD(size_D, 1, stD, bc); + device_strided_batch_vector dInfo(1, 1, 1, bc); + if(size_D) + 
CHECK_HIP_ERROR(dD.memcheck()); + CHECK_HIP_ERROR(dInfo.memcheck()); + + if(BATCHED) + { + // memory allocations + host_batch_vector hA(size_A, 1, bc); + host_batch_vector hARes(size_ARes, 1, bc); + host_batch_vector hB(size_B, 1, bc); + device_batch_vector dA(size_A, 1, bc); + device_batch_vector dB(size_B, 1, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + if(size_B) + CHECK_HIP_ERROR(dB.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + dA.data(), lda, stA, dB.data(), ldb, stB, + dD.data(), stD, dInfo.data(), bc), + rocblas_status_success); + if(argus.timing) + rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + sygvdj_hegvdj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, + stB, dD, stD, dInfo, bc, hA, hARes, hB, hD, hDRes, + hInfo, hInfoRes, &max_error, argus.singular); + + // collect performance data + if(argus.timing) + sygvdj_hegvdj_getPerfData( + handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, + hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf, argus.singular); + } + + else + { + // memory allocations + host_strided_batch_vector hA(size_A, 1, stA, bc); + host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); + host_strided_batch_vector hB(size_B, 1, stB, bc); + device_strided_batch_vector dA(size_A, 1, stA, bc); + device_strided_batch_vector dB(size_B, 1, stB, bc); + if(size_A) + CHECK_HIP_ERROR(dA.memcheck()); + if(size_B) + CHECK_HIP_ERROR(dB.memcheck()); + + // check quick return + if(n == 0 || bc == 0) + { + EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, + dA.data(), lda, stA, dB.data(), ldb, stB, + dD.data(), stD, dInfo.data(), bc), + rocblas_status_success); + if(argus.timing) + 
rocsolver_bench_inform(inform_quick_return); + + return; + } + + // check computations + if(argus.unit_check || argus.norm_check) + sygvdj_hegvdj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, + stB, dD, stD, dInfo, bc, hA, hARes, hB, hD, hDRes, + hInfo, hInfoRes, &max_error, argus.singular); + + // collect performance data + if(argus.timing) + sygvdj_hegvdj_getPerfData( + handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, + hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, + argus.profile_kernels, argus.perf, argus.singular); + } + + // validate results for rocsolver-test + // using 2 * n * machine_precision as tolerance + if(argus.unit_check) + ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); + + // output results for rocsolver-bench + if(argus.timing) + { + if(!argus.perf) + { + rocsolver_bench_header("Arguments:"); + if(BATCHED) + { + rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", + "batch_c"); + rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, bc); + } + else if(STRIDED) + { + rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", + "strideB", "strideD", "batch_c"); + rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, bc); + } + else + { + rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); + rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); + } + rocsolver_bench_header("Results:"); + if(argus.norm_check) + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); + rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); + } + else + { + rocsolver_bench_output("cpu_time_us", "gpu_time_us"); + rocsolver_bench_output(cpu_time_used, gpu_time_used); + } + rocsolver_bench_endl(); + } + else + { + if(argus.norm_check) + rocsolver_bench_output(gpu_time_used, max_error); + else + rocsolver_bench_output(gpu_time_used); + } + } + + // ensure all arguments were consumed 
+ argus.validate_consumed(); +} + +#define EXTERN_TESTING_SYGVDJ_HEGVDJ(...) \ + extern template void testing_sygvdj_hegvdj<__VA_ARGS__>(Arguments&); + +INSTANTIATE(EXTERN_TESTING_SYGVDJ_HEGVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) diff --git a/docs/api/lapacklike.rst b/docs/api/lapacklike.rst index 5f4067228..a7bb18f2d 100644 --- a/docs/api/lapacklike.rst +++ b/docs/api/lapacklike.rst @@ -388,6 +388,86 @@ rocsolver_hegvj_strided_batched() .. doxygenfunction:: rocsolver_chegvj_strided_batched +.. _syevdj: + +rocsolver_syevdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsyevdj + :outline: +.. doxygenfunction:: rocsolver_ssyevdj + +rocsolver_syevdj_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsyevdj_batched + :outline: +.. doxygenfunction:: rocsolver_ssyevdj_batched + +rocsolver_syevdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsyevdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_ssyevdj_strided_batched + +.. _heevdj: + +rocsolver_heevdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zheevdj + :outline: +.. doxygenfunction:: rocsolver_cheevdj + +rocsolver_heevdj_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zheevdj_batched + :outline: +.. doxygenfunction:: rocsolver_cheevdj_batched + +rocsolver_heevdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zheevdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_cheevdj_strided_batched + +.. _sygvdj: + +rocsolver_sygvdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsygvdj + :outline: +.. doxygenfunction:: rocsolver_ssygvdj + +rocsolver_sygvdj_batched() +--------------------------------------------------- +.. 
doxygenfunction:: rocsolver_dsygvdj_batched + :outline: +.. doxygenfunction:: rocsolver_ssygvdj_batched + +rocsolver_sygvdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_dsygvdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_ssygvdj_strided_batched + +.. _hegvdj: + +rocsolver_hegvdj() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zhegvdj + :outline: +.. doxygenfunction:: rocsolver_chegvdj + +rocsolver_hegvdj_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zhegvdj_batched + :outline: +.. doxygenfunction:: rocsolver_chegvdj_batched + +rocsolver_hegvdj_strided_batched() +--------------------------------------------------- +.. doxygenfunction:: rocsolver_zhegvdj_strided_batched + :outline: +.. doxygenfunction:: rocsolver_chegvdj_strided_batched + .. _likesvds: diff --git a/docs/design/tuning.rst b/docs/design/tuning.rst index def54ed7b..ed0e17b8c 100644 --- a/docs/design/tuning.rst +++ b/docs/design/tuning.rst @@ -262,10 +262,15 @@ STEDC_MIN_DC_SIZE (As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) +STEDC_NUM_SPLIT_BLKS +--------------------- +.. doxygendefine:: STEDC_NUM_SPLIT_BLKS + +(As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) 
-syevj and heevj functions -========================== +syevj, heevj, syevdj and heevdj functions +=========================================== The Jacobi eigensolver routines SYEVJ/HEEVJ (or the corresponding batched and strided-batched routines) can be executed with a single kernel call (for small-size matrices) or with multiple kernel calls (for large-size @@ -274,15 +279,20 @@ computed cosine and sine values, and the number of iterations/sweeps is controll the matrix is partitioned into blocks, Jacobi rotations are accumulated per block (to be applied in separate kernel calls), and the number of iterations/sweeps is controlled by the CPU (requiring synchronization of the handle stream). +When running SYEVDJ/HEEVDJ (or the corresponding batched and strided-batched routines), +the computation of the eigenvectors of the associated tridiagonal matrix +can be sped up using a divide-and-conquer approach, +provided the size of the independent block is large enough. + SYEVJ_BLOCKED_SWITCH ---------------------- .. doxygendefine:: SYEVJ_BLOCKED_SWITCH (As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) -STEDC_NUM_SPLIT_BLKS ---------------------- -.. doxygendefine:: STEDC_NUM_SPLIT_BLKS +SYEVDJ_MIN_DC_SIZE +------------------- +.. doxygendefine:: SYEVDJ_MIN_DC_SIZE (As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) diff --git a/docs/userguide/intro.rst b/docs/userguide/intro.rst index 22ff74e2b..199d3dac2 100644 --- a/docs/userguide/intro.rst +++ b/docs/userguide/intro.rst @@ -210,6 +210,10 @@ LAPACK-like functions :ref:`rocsolver_sygvj `, x, x, , :ref:`rocsolver_heevj `, , , x, x :ref:`rocsolver_hegvj `, , , x, x + :ref:`rocsolver_syevdj `, x, x, , + :ref:`rocsolver_sygvdj `, x, x, , + :ref:`rocsolver_heevdj `, , , x, x + :ref:`rocsolver_hegvdj `, , , x, x .. 
csv-table:: Singular value decomposition :header: "Function", "single", "double", "single complex", "double complex" diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index f48e7821a..5429fdac4 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -12276,7 +12276,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvdj_batched(rocblas_handle handle, The leading dimension of A_l. @param[in] strideA rocblas_stride. - Stride from the start of one matrix A_l to the next one A_(j+1). + Stride from the start of one matrix A_l to the next one A_(l+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. 
@param[in] @@ -16282,206 +16282,1242 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_cheevd_batched(rocblas_handle handle, rocblas_int* info, const rocblas_int batch_count); -ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - rocblas_double_complex* const A[], - const rocblas_int lda, - double* D, - const rocblas_stride strideD, - double* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + double* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of + real symmetric matrices A_l. + + \details + The eigenvalues are returned in ascending order. The eigenvectors are computed using a + divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors + are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect. + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill. + Specifies whether the upper or lower part of the symmetric matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0. + Number of rows and columns of matrices A_l. + @param[inout] + A pointer to type. Array on the GPU (the size depends on the value of strideA). + On entry, the matrices A_l. 
On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n. + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride. + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD). + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride. + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + E pointer to type. Array on the GPU (the size depends on the value of strideE). + This array is used to work internally with the tridiagonal matrix T_l associated with A_l. + On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l + (or properly speaking, a tridiagonal matrix equivalent to T_l). The diagonal elements + of this matrix are in D_l; those that converged correspond to a subset of the + eigenvalues of A_l (not necessarily ordered). + @param[in] + strideE rocblas_stride. + Stride from the start of one vector E_l to the next one E_(l+1). + There is no restriction for the value of strideE. Normal use case is strideE >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU. + If info[l] = 0, successful exit for matrix A_l. + If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. + i elements of E_l did not converge to zero. + If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to + compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + @param[in] + batch_count rocblas_int. batch_count >= 0. + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + float* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + double* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of + Hermitian matrices A_l. + + \details + The eigenvalues are returned in ascending order. The eigenvectors are computed using a + divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors + are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect. + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill. + Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0. + Number of rows and columns of matrices A_l. + @param[inout] + A pointer to type. Array on the GPU (the size depends on the value of strideA). + On entry, the matrices A_l. 
On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n. + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride. + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD). + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride. + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + E pointer to real type. Array on the GPU (the size depends on the value of strideE). + This array is used to work internally with the tridiagonal matrix T_l associated with A_l. + On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l + (or properly speaking, a tridiagonal matrix equivalent to T_l). The diagonal elements + of this matrix are in D_l; those that converged correspond to a subset of the + eigenvalues of A_l (not necessarily ordered). + @param[in] + strideE rocblas_stride. + Stride from the start of one vector E_l to the next one E_(l+1). + There is no restriction for the value of strideE. Normal use case is strideE >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU. + If info[l] = 0, successful exit for matrix A_l. + If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. + i elements of E_l did not converge to zero. + If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to + compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + @param[in] + batch_count rocblas_int. batch_count >= 0. + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + float* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + double* E, + const rocblas_stride strideE, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYEVDJ computes the eigenvalues and optionally the eigenvectors of a real symmetric + matrix A. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the symmetric matrix A is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. Array on the GPU of dimension lda*n.\n + On entry, the matrix A. 
On exit, the eigenvectors of A if they were computed and + the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[out] + D pointer to type. Array on the GPU of dimension n.\n + The eigenvalues of A in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* D, + rocblas_int* info); +//! @} + +/*! @{ + \brief HEEVDJ computes the eigenvalues and optionally the eigenvectors of a complex Hermitian + matrix A. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the Hermitian matrix A is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. 
Array on the GPU of dimension lda*n.\n + On entry, the matrix A. On exit, the eigenvectors of A if they were computed and + the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[out] + D pointer to real type. Array on the GPU of dimension n.\n + The eigenvalues of A in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + double* D, + rocblas_int* info); +//! @} + +/*! @{ + \brief SYEVDJ_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of real symmetric matrices A_l. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the symmetric matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. 
+ @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEEVDJ_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of complex Hermitian matrices A_l. 
+ + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYEVDJ_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of real symmetric matrices A_l. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. + The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the symmetric matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Pointer to type. Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. 
lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride.\n + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEEVDJ_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a + batch of complex Hermitian matrices A_l. + + \details + The eigenvalues are found using the iterative Jacobi algorithm and are returned in ascending order. 
+ The eigenvectors are computed using a divide-and-conquer approach depending on the value of evect. + The computed eigenvectors are orthonormal. + + @param[in] + handle rocblas_handle. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. + If uplo indicates lower (or upper), then the upper (or lower) part of A_l + is not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrices A_l. + @param[inout] + A Pointer to type. Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and + the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[in] + strideA rocblas_stride.\n + Stride from the start of one matrix A_l to the next one A_(l+1). + There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues of A_l in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use case is strideD >= n. + @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit for A_l. If info[l] = 1, the algorithm did not converge for A_l. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of matrices in the batch. 
+ **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_cheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief SYGVDJ computes the eigenvalues and (optionally) eigenvectors of + a real generalized symmetric-definite eigenproblem. + + \details + The problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A X = \lambda B X & \: \text{1st form,}\\ + A B X = \lambda X & \: \text{2nd form, or}\\ + B A X = \lambda X & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^T B Z=I & \: \text{if 1st or 2nd form, or}\\ + Z^T B^{-1} Z=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblem. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. 
+ rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A and B are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A and B + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. Array on the GPU of dimension lda*n.\n + On entry, the matrix A. On exit, the normalized matrix Z of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[inout] + B pointer to type. Array on the GPU of dimension ldb*n.\n + On entry, the symmetric positive definite matrix B. On exit, + the triangular factor of B as returned by \ref rocsolver_spotrf "POTRF". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrix B. + @param[out] + D pointer to type. Array on the GPU of dimension n.\n + The eigenvalues in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + If info = n + i, the leading minor of order i of B is not positive definite. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* B, + const rocblas_int ldb, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* B, + const rocblas_int ldb, + double* D, + rocblas_int* info); +//! 
@} + +/*! @{ + \brief HEGVDJ computes the eigenvalues and (optionally) eigenvectors of + a complex generalized Hermitian-definite eigenproblem. + + \details + The problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A X = \lambda B X & \: \text{1st form,}\\ + A B X = \lambda X & \: \text{2nd form, or}\\ + B A X = \lambda X & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^H B Z=I & \: \text{if 1st or 2nd form, or}\\ + Z^H B^{-1} Z=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblem. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A and B are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A and B + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A. + @param[inout] + A pointer to type. Array on the GPU of dimension lda*n.\n + On entry, the matrix A. On exit, the normalized matrix Z of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrix A. + @param[inout] + B pointer to type. Array on the GPU of dimension ldb*n.\n + On entry, the Hermitian positive definite matrix B. 
On exit, + the triangular factor of B as returned by \ref rocsolver_spotrf "POTRF". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrix B. + @param[out] + D pointer to real type. Array on the GPU of dimension n.\n + The eigenvalues in increasing order. + @param[out] + info pointer to a rocblas_int on the GPU.\n + If info = 0, successful exit. If info = 1, the algorithm did not converge. + If info = n + i, the leading minor of order i of B is not positive definite. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_chegvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + rocblas_float_complex* B, + const rocblas_int ldb, + float* D, + rocblas_int* info); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zhegvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + rocblas_double_complex* B, + const rocblas_int ldb, + double* D, + rocblas_int* info); +//! @} + +/*! @{ + \brief SYGVDJ_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of real generalized symmetric-definite eigenproblems. + + \details + For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. 
+ + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^T_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^T_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. + @param[inout] + A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[inout] + B array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*n.\n + On entry, the symmetric positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_batched "POTRF_BATCHED". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[out] + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. 
+ @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_ssygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_dsygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); +//! @} + +/*! @{ + \brief HEGVDJ_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of complex generalized Hermitian-definite eigenproblems. + + \details + For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. 
+ + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^H_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^H_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] + + @param[in] + handle rocblas_handle. + @param[in] + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n + Specifies whether the eigenvectors are to be computed. + If evect is rocblas_evect_original, then the eigenvectors are computed. + rocblas_evect_tridiagonal is not supported. + @param[in] + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. + @param[in] + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. + @param[inout] + A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. + @param[in] + lda rocblas_int. lda >= n.\n + Specifies the leading dimension of matrices A_l. + @param[inout] + B array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*n.\n + On entry, the Hermitian positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_batched "POTRF_BATCHED". + @param[in] + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[out] + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. + @param[in] + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. 
+ @param[out] + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. + @param[in] + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. + **************************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_chegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + rocblas_float_complex* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); + +ROCSOLVER_EXPORT rocblas_status rocsolver_zhegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + rocblas_double_complex* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); //! @} /*! @{ - \brief SYEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of - real symmetric matrices A_l. + \brief SYGVDJ_STRIDED_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of real generalized symmetric-definite eigenproblems. \details - The eigenvalues are returned in ascending order. The eigenvectors are computed using a - divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors - are orthonormal. 
+ For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^T_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^T_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] @param[in] handle rocblas_handle. @param[in] - evect #rocblas_evect. + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n Specifies whether the eigenvectors are to be computed. If evect is rocblas_evect_original, then the eigenvectors are computed. rocblas_evect_tridiagonal is not supported. @param[in] - uplo rocblas_fill. - Specifies whether the upper or lower part of the symmetric matrices A_l is stored. - If uplo indicates lower (or upper), then the upper (or lower) part of A_l - is not used. + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. @param[in] - n rocblas_int. n >= 0. - Number of rows and columns of matrices A_l. + n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. @param[inout] - A pointer to type. Array on the GPU (the size depends on the value of strideA). - On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and - the algorithm converged; otherwise the contents of A_l are destroyed. + A pointer to type. 
Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. @param[in] - lda rocblas_int. lda >= n. + lda rocblas_int. lda >= n.\n Specifies the leading dimension of matrices A_l. @param[in] - strideA rocblas_stride. + strideA rocblas_stride.\n Stride from the start of one matrix A_l to the next one A_(l+1). - There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. - @param[out] - D pointer to type. Array on the GPU (the size depends on the value of strideD). - The eigenvalues of A_l in increasing order. + There is no restriction for the value of strideA. Normal use is strideA >= lda*n. + @param[inout] + B pointer to type. Array on the GPU (the size depends on the value of strideB).\n + On entry, the symmetric positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_strided_batched "POTRF_STRIDED_BATCHED". @param[in] - strideD rocblas_stride. - Stride from the start of one vector D_l to the next one D_(l+1). - There is no restriction for the value of strideD. Normal use case is strideD >= n. + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[in] + strideB rocblas_stride.\n + Stride from the start of one matrix B_l to the next one B_(l+1). + There is no restriction for the value of strideB. Normal use is strideB >= ldb*n. @param[out] - E pointer to type. Array on the GPU (the size depends on the value of strideE). - This array is used to work internally with the tridiagonal matrix T_l associated with A_l. - On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l - (or properly speaking, a tridiagonal matrix equivalent to T_l). 
The diagonal elements - of this matrix are in D_l; those that converged correspond to a subset of the - eigenvalues of A_l (not necessarily ordered). + D pointer to type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. @param[in] - strideE rocblas_stride. - Stride from the start of one vector E_l to the next one E_(l+1). - There is no restriction for the value of strideE. Normal use case is strideE >= n. + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. @param[out] - info pointer to rocblas_int. Array of batch_count integers on the GPU. - If info[l] = 0, successful exit for matrix A_l. - If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. - i elements of E_l did not converge to zero. - If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to - compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. @param[in] - batch_count rocblas_int. batch_count >= 0. - Number of matrices in the batch. + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. 
**************************************************************************************/ -ROCSOLVER_EXPORT rocblas_status rocsolver_ssyevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - float* A, - const rocblas_int lda, - const rocblas_stride strideA, - float* D, - const rocblas_stride strideD, - float* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_ssygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); -ROCSOLVER_EXPORT rocblas_status rocsolver_dsyevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - double* A, - const rocblas_int lda, - const rocblas_stride strideA, - double* D, - const rocblas_stride strideD, - double* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_dsygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); //! @} /*! @{ - \brief HEEVD_STRIDED_BATCHED computes the eigenvalues and optionally the eigenvectors of a batch of - Hermitian matrices A_l. 
+ \brief HEGVDJ_STRIDED_BATCHED computes the eigenvalues and (optionally) eigenvectors of + batch of complex generalized Hermitian-definite eigenproblems. \details - The eigenvalues are returned in ascending order. The eigenvectors are computed using a - divide-and-conquer algorithm, depending on the value of evect. The computed eigenvectors - are orthonormal. + For each instance in the batch, the problem solved by this function is either of the form + + \f[ + \begin{array}{cl} + A_l X_l = \lambda B_l X_l & \: \text{1st form,}\\ + A_l B_l X_l = \lambda X_l & \: \text{2nd form, or}\\ + B_l A_l X_l = \lambda X_l & \: \text{3rd form,} + \end{array} + \f] + + depending on the value of itype. The eigenvalues are found using the iterative Jacobi algorithm, + and are returned in ascending order. The eigenvectors are computed using a divide-and-conquer algorithm, + depending on the value of evect. + + When computed, the matrix Z_l of eigenvectors is normalized as follows: + + \f[ + \begin{array}{cl} + Z^H_l B_l Z_l=I & \: \text{if 1st or 2nd form, or}\\ + Z^H_l B^{-1}_l Z_l=I & \: \text{if 3rd form.} + \end{array} + \f] @param[in] handle rocblas_handle. @param[in] - evect #rocblas_evect. + itype #rocblas_eform.\n + Specifies the form of the generalized eigenproblems. + @param[in] + evect #rocblas_evect.\n Specifies whether the eigenvectors are to be computed. If evect is rocblas_evect_original, then the eigenvectors are computed. rocblas_evect_tridiagonal is not supported. @param[in] - uplo rocblas_fill. - Specifies whether the upper or lower part of the Hermitian matrices A_l is stored. - If uplo indicates lower (or upper), then the upper (or lower) part of A_l - is not used. + uplo rocblas_fill.\n + Specifies whether the upper or lower parts of the matrices A_l and B_l are stored. + If uplo indicates lower (or upper), then the upper (or lower) parts of A_l and B_l + are not used. @param[in] - n rocblas_int. n >= 0. - Number of rows and columns of matrices A_l. 
+ n rocblas_int. n >= 0.\n + Number of rows and columns of matrix A_l. @param[inout] - A pointer to type. Array on the GPU (the size depends on the value of strideA). - On entry, the matrices A_l. On exit, the eigenvectors of A_l if they were computed and - the algorithm converged; otherwise the contents of A_l are destroyed. + A pointer to type. Array on the GPU (the size depends on the value of strideA).\n + On entry, the matrices A_l. On exit, the normalized matrices Z_l of eigenvectors if they were computed + and the algorithm converged; otherwise the contents of A_l are destroyed. @param[in] - lda rocblas_int. lda >= n. + lda rocblas_int. lda >= n.\n Specifies the leading dimension of matrices A_l. @param[in] - strideA rocblas_stride. + strideA rocblas_stride.\n Stride from the start of one matrix A_l to the next one A_(l+1). - There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. - @param[out] - D pointer to real type. Array on the GPU (the size depends on the value of strideD). - The eigenvalues of A_l in increasing order. + There is no restriction for the value of strideA. Normal use is strideA >= lda*n. + @param[inout] + B pointer to type. Array on the GPU (the size depends on the value of strideB).\n + On entry, the Hermitian positive definite matrices B_l. On exit, + the triangular factor of B_l as returned by \ref rocsolver_spotrf_strided_batched "POTRF_STRIDED_BATCHED". @param[in] - strideD rocblas_stride. - Stride from the start of one vector D_l to the next one D_(l+1). - There is no restriction for the value of strideD. Normal use case is strideD >= n. + ldb rocblas_int. ldb >= n.\n + Specifies the leading dimension of matrices B_l. + @param[in] + strideB rocblas_stride.\n + Stride from the start of one matrix B_l to the next one B_(l+1). + There is no restriction for the value of strideB. Normal use is strideB >= ldb*n. @param[out] - E pointer to real type. Array on the GPU (the size depends on the value of strideE). 
- This array is used to work internally with the tridiagonal matrix T_l associated with A_l. - On exit, if info[l] > 0, E_l contains the unconverged off-diagonal elements of T_l - (or properly speaking, a tridiagonal matrix equivalent to T_l). The diagonal elements - of this matrix are in D_l; those that converged correspond to a subset of the - eigenvalues of A_l (not necessarily ordered). + D pointer to real type. Array on the GPU (the size depends on the value of strideD).\n + The eigenvalues in increasing order. @param[in] - strideE rocblas_stride. - Stride from the start of one vector E_l to the next one E_(l+1). - There is no restriction for the value of strideE. Normal use case is strideE >= n. + strideD rocblas_stride.\n + Stride from the start of one vector D_l to the next one D_(l+1). + There is no restriction for the value of strideD. Normal use is strideD >= n. @param[out] - info pointer to rocblas_int. Array of batch_count integers on the GPU. - If info[l] = 0, successful exit for matrix A_l. - If info[l] = i > 0 and evect is rocblas_evect_none, the algorithm did not converge. - i elements of E_l did not converge to zero. - If info[l] = i > 0 and evect is rocblas_evect_original, the algorithm failed to - compute an eigenvalue in the submatrix from [i/(n+1), i/(n+1)] to [i%(n+1), i%(n+1)]. + info pointer to rocblas_int. Array of batch_count integers on the GPU.\n + If info[l] = 0, successful exit. If info[l] = 1, the algorithm did not converge for matrix A_l. + If info[l] = n + i, the leading minor of order i of B_l is not positive definite. @param[in] - batch_count rocblas_int. batch_count >= 0. - Number of matrices in the batch. + batch_count rocblas_int. batch_count >= 0.\n + Number of eigenproblems in the batch. 
**************************************************************************************/ -ROCSOLVER_EXPORT rocblas_status rocsolver_cheevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - rocblas_float_complex* A, - const rocblas_int lda, - const rocblas_stride strideA, - float* D, - const rocblas_stride strideD, - float* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_chegvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_float_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); -ROCSOLVER_EXPORT rocblas_status rocsolver_zheevd_strided_batched(rocblas_handle handle, - const rocblas_evect evect, - const rocblas_fill uplo, - const rocblas_int n, - rocblas_double_complex* A, - const rocblas_int lda, - const rocblas_stride strideA, - double* D, - const rocblas_stride strideD, - double* E, - const rocblas_stride strideE, - rocblas_int* info, - const rocblas_int batch_count); +ROCSOLVER_EXPORT rocblas_status rocsolver_zhegvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_double_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count); //! @} /*! @{ @@ -24116,9 +25152,9 @@ ROCSOLVER_EXPORT rocblas_status */ /*! 
\brief CREATE_RFINFO initializes the structure rfinfo that contains the meta data and descriptors of the involved matrices - required by the re-factorization functions + required by the re-factorization functions \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and - by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". + by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @param[out] @@ -24132,7 +25168,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_create_rfinfo(rocsolver_rfinfo* rfinfo rocblas_handle handle); /*! \brief DESTROY_RFINFO destroys the structure rfinfo used by the re-factorization functions - \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and + \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @@ -24145,7 +25181,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_destroy_rfinfo(rocsolver_rfinfo rfinfo /*! \brief SET_RFINFO_MODE sets the mode of the structure rfinfo required by the re-factorization functions \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and - by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". + by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @param[in] @@ -24153,7 +25189,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_destroy_rfinfo(rocsolver_rfinfo rfinfo The rfinfo struct to be set up. @param[in] mode #rocsolver_rfinfo_mode. - Use rocsolver_rfinfo_mode_cholesky when the Cholesky factorization is required. + Use rocsolver_rfinfo_mode_cholesky when the Cholesky factorization is required. 
********************************************************************/ ROCSOLVER_EXPORT rocblas_status rocsolver_set_rfinfo_mode(rocsolver_rfinfo rfinfo, @@ -24161,7 +25197,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_set_rfinfo_mode(rocsolver_rfinfo rfinf /*! \brief GET_RFINFO_MODE gets the mode of the structure rfinfo required by the re-factorization functions \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and - by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". + by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". \details @param[in] @@ -24169,7 +25205,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_set_rfinfo_mode(rocsolver_rfinfo rfinf The referenced rfinfo struct. @param[out] mode #rocsolver_rfinfo_mode. - The queried mode. + The queried mode. ********************************************************************/ ROCSOLVER_EXPORT rocblas_status rocsolver_get_rfinfo_mode(rocsolver_rfinfo rfinfo, @@ -24344,7 +25380,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, /*! @{ \brief CSRRF_ANALYSIS performs the analysis phase required by the re-factorization functions - \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and + \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" and \ref rocsolver_scsrrf_refactchol "CSRRF_REFACTCHOL", and by the direct solver \ref rocsolver_scsrrf_solve "CSRRF_SOLVE". 
\details Consider a sparse matrix \f$M\f$ previously factorized as @@ -24353,20 +25389,20 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, Q^TMQ = L_ML_M^T \f] - (Cholesky factorization for the symmetric positive definite case), or + (Cholesky factorization for the symmetric positive definite case), or - \f[ + \f[ PMQ = L_MU_M \f] - (LU factorization for the general case) + (LU factorization for the general case) where \f$L_M\f$ is lower triangular (with unit diagonal in the general case), \f$U_M\f$ is upper triangular, and \f$P\f$ and \f$Q\f$ are permutation matrices associated with pivoting and re-ordering (to minimize fill-in), respectively. The meta data generated by this routine is collected in the output parameter rfinfo. This information will allow the fast re-factorization of another sparse matrix \f$A\f$ as - \f[ + \f[ Q^TAQ = L_AL_A^T, \quad \text{or} \f] @@ -24382,17 +25418,17 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, as long as \f$A\f$ has the same sparsity pattern as the previous matrix \f$M\f$. - This function supposes that the rfinfo struct has been initialized by \ref rocsolver_create_rfinfo "RFINFO_CREATE". - By default, rfinfo is set up to work with the LU factorization (general matrices). If the matrix is symmetric positive definite, - and the Cholesky factorization is - desired, then the corresponding mode must be manually set up by \ref rocsolver_set_rfinfo_mode "SET_RFINFO_MODE". This function - does not automatically detect symmetry. + This function supposes that the rfinfo struct has been initialized by \ref rocsolver_create_rfinfo "RFINFO_CREATE". + By default, rfinfo is set up to work with the LU factorization (general matrices). If the matrix is symmetric positive definite, + and the Cholesky factorization is + desired, then the corresponding mode must be manually set up by \ref rocsolver_set_rfinfo_mode "SET_RFINFO_MODE". 
This function + does not automatically detect symmetry. For the LU factorization mode, the LU factors \f$L_M\f$ and \f$U_M\f$ must be passed in a bundle matrix \f$T=(L_M-I)+U_M\f$ as returned by \ref rocsolver_scsrrf_sumlu "CSRRF_SUMLU". For the Cholesky mode, the lower triangular part of \f$T\f$ must contain the Cholesky factor \f$L_M\f$; the strictly upper triangular - part of \f$T\f$ will be ignored. Similarly, the strictly upper triangular part of \f$M\f$ is ignored when working - in Cholesky mode. + part of \f$T\f$ will be ignored. Similarly, the strictly upper triangular part of \f$M\f$ is ignored when working + in Cholesky mode. \note If only a re-factorization will be executed (i.e. no solver phase), then nrhs can be set to zero @@ -24421,7 +25457,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, @param[in] valM pointer to type. Array on the GPU of dimension nnzM. The values of the non-zero elements of M. The strictly upper triangular entries are - not referenced when working in Cholesky mode. + not referenced when working in Cholesky mode. @param[in] nnzT rocblas_int. nnzT >= 0. The number of non-zero elements in T. @@ -24441,7 +25477,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_splitlu(rocblas_handle handle, pivP pointer to rocblas_int. Array on the GPU of dimension n. Contains the pivot indices representing the permutation matrix P, i.e. the order in which the rows of matrix M were re-arranged. When working in Cholesky mode, - this array is not referenced and can be null. + this array is not referenced and can be null. @param[in] pivQ pointer to rocblas_int. Array on the GPU of dimension n. Contains the pivot indices representing the permutation matrix Q, i.e. 
the @@ -24515,8 +25551,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_analysis(rocblas_handle handle, This function supposes that rfinfo has been updated, by function \ref rocsolver_scsrrf_analysis "CSRRF_ANALYSIS", after the analysis phase of the previous matrix M and its initial factorization. Both functions, CSRRF_ANALYSIS and - CSRRF_REFACTLU must be run with the same rfinfo mode (LU factorization, the default mode), otherwise the workflow will - result in an error. + CSRRF_REFACTLU must be run with the same rfinfo mode (LU factorization, the default mode), otherwise the workflow will + result in an error. @param[in] handle rocblas_handle. @@ -24595,7 +25631,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_refactlu(rocblas_handle handle, /*! @{ \brief CSRRF_REFACTCHOL performs a fast Cholesky factorization of a sparse symmetric positive definite matrix \f$A\f$ - based on the information from the factorization of a previous matrix \f$M\f$ with the same sparsity pattern + based on the information from the factorization of a previous matrix \f$M\f$ with the same sparsity pattern (re-factorization). \details Consider a sparse matrix \f$M\f$ previously factorized as @@ -24652,7 +25688,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_refactlu(rocblas_handle handle, @param[out] valT pointer to type. Array on the GPU of dimension nnzT. The values of the non-zero elements of the new Cholesky factor L_A. - The strictly upper triangular entries of this array are not referenced. + The strictly upper triangular entries of this array are not referenced. @param[in] pivQ pointer to rocblas_int. Array on the GPU of dimension n. Contains the pivot indices representing the permutation matrix Q, i.e. 
the @@ -24705,23 +25741,23 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dcsrrf_refactchol(rocblas_handle handl Q^TAQ = L_AL_A^T \f] - (Cholesky factorization for the symmetric positive definite case), or + (Cholesky factorization for the symmetric positive definite case), or - \f[ + \f[ PAQ = L_AU_A \f] - (LU factorization for the general case), + (LU factorization for the general case), and \f$B\f$ is a dense matrix of right hand sides. - This function supposes that rfinfo has been updated by function \ref rocsolver_scsrrf_analysis "CSRRF_ANALYSIS", + This function supposes that rfinfo has been updated by function \ref rocsolver_scsrrf_analysis "CSRRF_ANALYSIS", after the analysis phase. Both functions, CSRRF_ANALYSIS and CSRRF_SOLVE must be run with the same rfinfo mode (LU or Cholesky factorization), otherwise the workflow will result in an error. For the LU factorization mode, the LU factors \f$L_A\f$ and \f$U_A\f$ must be passed in a bundle matrix \f$T=(L_A-I)+U_A\f$ - as returned by \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" or \ref rocsolver_scsrrf_sumlu "CSRRF_SUMLU". For the Cholesky mode, + as returned by \ref rocsolver_scsrrf_refactlu "CSRRF_REFACTLU" or \ref rocsolver_scsrrf_sumlu "CSRRF_SUMLU". For the Cholesky mode, the lower triangular part of \f$T\f$ must contain the Cholesky factor \f$L_A\f$; the strictly upper triangular part of \f$T\f$ will be ignored. diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index dd7fc560a..1b3998cba 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -1,5 +1,5 @@ # ########################################################################## -# Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -172,6 +172,12 @@ set(rocsolver_lapack_source lapack/roclapack_sygvx_hegvx_batched.cpp lapack/roclapack_sygvx_hegvx_strided_batched.cpp lapack/roclapack_sygvdx_hegvdx_inplace.cpp + lapack/roclapack_syevdj_heevdj.cpp + lapack/roclapack_syevdj_heevdj_batched.cpp + lapack/roclapack_syevdj_heevdj_strided_batched.cpp + lapack/roclapack_sygvdj_hegvdj.cpp + lapack/roclapack_sygvdj_hegvdj_batched.cpp + lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp ) set(rocsolver_auxiliary_source diff --git a/library/src/auxiliary/rocauxiliary_stedc.hpp b/library/src/auxiliary/rocauxiliary_stedc.hpp index a4e35e153..2cc595f6c 100644 --- a/library/src/auxiliary/rocauxiliary_stedc.hpp +++ b/library/src/auxiliary/rocauxiliary_stedc.hpp @@ -4,7 +4,7 @@ * Univ. of Tennessee, Univ. of California Berkeley, * Univ. of Colorado Denver and NAG Ltd.. * December 2016 - * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -39,12 +39,17 @@ #include "rocblas.hpp" #include "rocsolver/rocsolver.h" -#define STEDC_BDIM 512 // Number of threads per thread-block used in main stedc kernel +#define STEDC_BDIM 512 // Number of threads per thread-block used in main stedc kernels #define MAXITERS 50 // Max number of iterations for root finding method +#define MAXSWEEPS 20 // Max number of sweeps for Jacobi solver (when used) #define CLASSICQR 1 // used to specify classic QR iteration solver (default case) #define JACOBI 2 // used to specify Jacobi iteration solver #define BISECTION 3 // used to specify bisection and inverse iteration solver +/***************** Device auxiliary functions *****************************************/ +/**************************************************************************************/ + +//--------------------------------------------------------------------------------------// /** SEQ_EVAL evaluates the secular equation at a given point. It accumulates the corrections to the elements in D so that distance to poles are computed accurately **/ template @@ -143,6 +148,7 @@ __device__ void seq_eval(const rocblas_int type, *pt_er = er; } +//--------------------------------------------------------------------------------------// /** SEQ_SOLVE solves secular equation at point k (i.e. computes kth eigenvalue that is within an internal interval). We use rational interpolation and fixed weights method between the 2 poles of the interval. @@ -405,6 +411,7 @@ __device__ rocblas_int seq_solve(const rocblas_int dd, return converged ? 0 : 1; } +//--------------------------------------------------------------------------------------// /** SEQ_SOLVE_EXT solves secular equation at point n (i.e. computes last eigenvalue). We use rational interpolation and fixed weights method between the (n-1)th and nth poles. 
(TODO: In the future, we could consider using 3 poles for those cases that may need it @@ -597,28 +604,76 @@ __device__ rocblas_int seq_solve_ext(const rocblas_int dd, return converged ? 0 : 1; } +//--------------------------------------------------------------------------------------// /** STEDC_NUM_LEVELS returns the ideal number of times/levels in which a matrix (or split block) will be divided during the divide phase of divide & conquer algorithm. i.e. number of sub-blocks = 2^levels **/ __host__ __device__ inline rocblas_int stedc_num_levels(const rocblas_int n, int solver_mode) { rocblas_int levels = 0; + // return the max number of levels such that the sub-blocks are at least of size 1 + // (i.e. 2^levels <= n), and there are no more than 256 sub-blocks (i.e. 2^levels <= 256) + if(n <= 2) + return levels; switch(solver_mode) { case BISECTION: + { // TODO break; + } - case JACOBI: - // TODO + case JACOBI: // TODO: tuning will be required; using the same tuning as QR for now + { + if(n <= 4) + { + levels = 1; + } + else if(n <= 32) + { + levels = 2; + } + else if(n <= 232) + { + levels = 4; + } + else + { + if(n <= 1946) + { + if(n <= 1692) + { + if(n <= 295) + { + levels = 5; + } + else + { + levels = 7; + } + } + else + { + levels = 7; + } + } + else + { + levels = 8; + } + } break; + } case CLASSICQR: default: - // return the max number of levels such that the sub-blocks are at least of size 8, and - // there are no more than 256 sub-blocks - if(n <= 32) + { + if(n <= 4) + { + levels = 1; + } + else if(n <= 32) { levels = 2; } @@ -652,13 +707,45 @@ __host__ __device__ inline rocblas_int stedc_num_levels(const rocblas_int n, int } } } + } //end switch return levels; } -/** STEDC_SPLIT finds independent blocks in the tridiagonal matrix - given by D and E. 
(The independent blocks can then be solved in - parallel by the DC algorithm) **/ +//--------------------------------------------------------------------------------------// +/** DE2TRIDIAG generates a tridiagonal matrix from vectors of diagonal entries (D) and + off-diagonal entries (E). **/ +template +__device__ inline void de2tridiag(const int numt, + const rocblas_int id, + const rocblas_int n, + S* D, + S* E, + S* C, + const rocblas_int ldc) +{ + for(rocblas_int k = id; k < n * n; k += numt) + { + rocblas_int i = k % n; + rocblas_int j = k / n; + S val; + bool offd = (i == j + 1); + if(offd || i == j - 1) + val = offd ? E[j] : E[i]; + else + val = (i == j) ? D[i] : 0; + C[i + j * ldc] = val; + } +} + +/*************** Main kernels *********************************************************/ +/**************************************************************************************/ + +//--------------------------------------------------------------------------------------// +/** STEDC_SPLIT finds independent blocks (split-blocks) in the tridiagonal matrix + given by D and E. The independent blocks can then be solved in + parallel by the DC algorithm. + - Call this kernel with batch_count single-threaded groups in x **/ template ROCSOLVER_KERNEL void stedc_split(const rocblas_int n, S* DD, @@ -673,7 +760,7 @@ ROCSOLVER_KERNEL void stedc_split(const rocblas_int n, // select batch instance S* D = DD + (bid * strideD); S* E = EE + (bid * strideE); - rocblas_int* splits = splitsA + bid * (3 * n + 2); + rocblas_int* splits = splitsA + bid * (5 * n + 2); rocblas_int k = 0; //position where the last block starts S tol; //tolerance. If an element of E is <= tol we have an independent block @@ -704,28 +791,330 @@ ROCSOLVER_KERNEL void stedc_split(const rocblas_int n, splits[n + 1] = nb; //also save the number of split blocks } -/** STEDC_KERNEL implements the main loop of the DC algorithm to compute the - eigenvalues/eigenvectors of the symmetric tridiagonal submatrices. 
**/ +//--------------------------------------------------------------------------------------// +/** STEDC_DIVIDE_KERNEL implements the divide phase of the DC algorithm. It divides each + split-block into a number of sub-blocks. + - Call this kernel with batch_count groups in x. Groups are of size STEDC_BDIM. + - If there are actually more split-blocks than STEDC_BDIM, some threads will work with more + than one split-block sequentially. **/ +template +ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_divide_kernel(const rocblas_int n, + S* DD, + const rocblas_stride strideD, + S* EE, + const rocblas_stride strideE, + rocblas_int* splitsA, + const int solver_mode) +{ + // threads and groups indices + /* --------------------------------------------------- */ + // batch instance id + rocblas_int bid = hipBlockIdx_x; + // split-block id + rocblas_int sid = hipThreadIdx_x; + /* --------------------------------------------------- */ + + // select batch instance to work with + /* --------------------------------------------------- */ + S* D = DD + bid * strideD; + S* E = EE + bid * strideE; + /* --------------------------------------------------- */ + + // temporary arrays in global memory + /* --------------------------------------------------- */ + // contains the beginning of split blocks + rocblas_int* splits = splitsA + bid * (5 * n + 2); + // the sub-blocks sizes + rocblas_int* nsA = splits + n + 2; + // the sub-blocks initial positions + rocblas_int* psA = nsA + n; + /* --------------------------------------------------- */ + + // local variables + /* --------------------------------------------------- */ + // total number of split blocks + rocblas_int nb = splits[n + 1]; + // size of split block + rocblas_int bs; + // beginning of split block + rocblas_int p1; + // beginning of sub-block + rocblas_int p2; + // number of sub-blocks + rocblas_int blks; + // number of level of division + rocblas_int levs; + // other aux variables + S p; + rocblas_int *ns, 
*ps;
+    /* --------------------------------------------------- */
+
+    // work with STEDC_BDIM split blocks in parallel
+    /* --------------------------------------------------- */
+    for(int kb = sid; kb < nb; kb += STEDC_BDIM)
+    {
+        // Select current split block
+        p1 = splits[kb];
+        p2 = splits[kb + 1];
+        bs = p2 - p1;
+        ns = nsA + p1;
+        ps = psA + p1;
+
+        // determine ideal number of sub-blocks in split-block
+        levs = stedc_num_levels(bs, solver_mode);
+        blks = 1 << levs;
+
+        // 1. DIVIDE PHASE
+        /* ----------------------------------------------------------------- */
+        // (artificially divide split-block into blks sub-blocks
+        // find initial positions of each sub-blocks)
+
+        // find sizes of sub-blocks
+        ns[0] = bs;
+        rocblas_int t, t2;
+        for(int i = 0; i < levs; ++i)
+        {
+            for(int j = (1 << i); j > 0; --j)
+            {
+                t = ns[j - 1];
+                t2 = t / 2;
+                ns[j * 2 - 1] = (2 * t2 < t) ? t2 + 1 : t2;
+                ns[j * 2 - 2] = t2;
+            }
+        }
+
+        // find beginning of sub-blocks and update D elements
+        p2 = p1;
+        ps[0] = p2;
+        for(int i = 1; i < blks; ++i)
+        {
+            p2 += ns[i - 1];
+            ps[i] = p2;
+
+            // perform sub-block division
+            p = E[p2 - 1];
+            D[p2] -= p;
+            D[p2 - 1] -= p;
+        }
+    }
+}
+
+//--------------------------------------------------------------------------------------//
+/** STEDC_SOLVE_KERNEL implements the solver phase of the DC algorithm to
+    compute the eigenvalues/eigenvectors of the different sub-blocks of each split-block.
+    A matrix in the batch could have many split-blocks, and each split-block could be
+    divided in a maximum of nn sub-blocks.
+    - Call this kernel with batch_count groups in z, STEDC_NUM_SPLIT_BLKS groups in y
+    and nn groups in x. Groups are size STEDC_BDIM.
+    - STEDC_NUM_SPLIT_BLKS is fixed (is the number of split-blocks that will be analysed
+    in parallel). If there are actually more split-blocks, some groups will work with more
+    than one split-block sequentially.
+    - An upper bound for the number of sub-blocks (nn) can be estimated from the size n.
+    If a group has an id larger than the actual number of sub-blocks in a split-block,
+    it will do nothing. **/
+template 
+ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_solve_kernel(const rocblas_int n,
+                                                                       S* DD,
+                                                                       const rocblas_stride strideD,
+                                                                       S* EE,
+                                                                       const rocblas_stride strideE,
+                                                                       S* CC,
+                                                                       const rocblas_int shiftC,
+                                                                       const rocblas_int ldc,
+                                                                       const rocblas_stride strideC,
+                                                                       rocblas_int* iinfo,
+                                                                       S* WA,
+                                                                       rocblas_int* splitsA,
+                                                                       const S eps,
+                                                                       const S ssfmin,
+                                                                       const S ssfmax,
+                                                                       const int solver_mode)
+{
+    // threads and groups indices
+    /* --------------------------------------------------- */
+    // batch instance id
+    rocblas_int bid = hipBlockIdx_z;
+    // split-block id
+    rocblas_int sid = hipBlockIdx_y;
+    // sub-block id
+    rocblas_int tid = hipBlockIdx_x;
+    // thread index
+    rocblas_int tidb = hipThreadIdx_x;
+    /* --------------------------------------------------- */
+
+    // select batch instance to work with
+    /* --------------------------------------------------- */
+    S* C;
+    if(CC)
+        C = load_ptr_batch(CC, bid, shiftC, strideC);
+    S* D = DD + bid * strideD;
+    S* E = EE + bid * strideE;
+    rocblas_int* info = iinfo + bid;
+    /* --------------------------------------------------- */
+
+    // temporary arrays in global memory
+    /* --------------------------------------------------- */
+    // contains the beginning of split blocks
+    rocblas_int* splits = splitsA + bid * (5 * n + 2);
+    // the sub-blocks sizes
+    rocblas_int* nsA = splits + n + 2;
+    // the sub-blocks initial positions
+    rocblas_int* psA = nsA + n;
+    // workspace for solvers
+    S* W;
+    switch(solver_mode)
+    {
+    case BISECTION: break;
+
+    case JACOBI: W = WA + bid * (2 + n * n); break;
+
+    case CLASSICQR:
+    default: W = WA + bid * (2 * n);
+    }
+    /* --------------------------------------------------- */
+
+    // temporary arrays in shared memory
+    /* --------------------------------------------------- */
+    extern __shared__ rocblas_int lsmem[];
+    // shared mem for jacobi solver if needed
+    S* sj1;
+ rocblas_int* sj2; + if(solver_mode == JACOBI) + { + sj2 = lsmem; + sj1 = reinterpret_cast(sj2 + n + n % 2); + } + /* --------------------------------------------------- */ + + // local variables + /* --------------------------------------------------- */ + // total number of split blocks + rocblas_int nb = splits[n + 1]; + // size of split block + rocblas_int bs; + // size of sub block + rocblas_int sbs; + // beginning of split block + rocblas_int p1; + // beginning of sub-block + rocblas_int p2; + // number of sub-blocks + rocblas_int blks; + // number of level of division + rocblas_int levs; + // other aux variables + S p; + rocblas_int *ns, *ps; + /* --------------------------------------------------- */ + + // work with STEDC_NUM_SPLIT_BLKS split blocks in parallel + /* --------------------------------------------------- */ + for(int kb = sid; kb < nb; kb += STEDC_NUM_SPLIT_BLKS) + { + // Select current split block + p1 = splits[kb]; + p2 = splits[kb + 1]; + bs = p2 - p1; + ns = nsA + p1; + ps = psA + p1; + + // determine ideal number of sub-blocks + levs = stedc_num_levels(bs, solver_mode); + blks = 1 << levs; + + // 2. SOLVE PHASE + /* ----------------------------------------------------------------- */ + // Solve the blks sub-blocks in parallel. 
+
+        if(tid < blks)
+        {
+            sbs = ns[tid];
+            p2 = ps[tid];
+
+            switch(solver_mode)
+            {
+            case BISECTION:
+            {
+                // TODO
+                break;
+            }
+
+            case JACOBI:
+            {
+                // transform D and E into full upper tridiag matrix and copy to C
+                de2tridiag(STEDC_BDIM, tidb, sbs, D + p2, E + p2, C + p2 + p2 * ldc, ldc);
+
+                // set work space
+                S* W_Acpy = W;
+                S* W_residual = W_Acpy + n * n;
+                rocblas_int* W_n_sweeps = reinterpret_cast(W_residual + 1);
+
+                // set shared mem
+                rocblas_int even_n = sbs + sbs % 2;
+                rocblas_int half_n = even_n / 2;
+                S* cosines_res = sj1;
+                S* sines_diag = cosines_res + half_n;
+                rocblas_int* top = sj2;
+                rocblas_int* bottom = top + half_n;
+
+                // re-arrange threads in 2D array
+                rocblas_int ddx, ddy;
+                syevj_get_dims(sbs, STEDC_BDIM, &ddx, &ddy);
+                rocblas_int tix = tidb % ddx;
+                rocblas_int tiy = tidb / ddx;
+                __syncthreads();
+
+                // solve
+                run_syevj(ddx, ddy, tix, tiy, rocblas_esort_ascending, rocblas_evect_original,
+                          rocblas_fill_upper, sbs, C + p2 + p2 * ldc, ldc, 0, eps, W_residual,
+                          MAXSWEEPS, W_n_sweeps, D + p2, info, W_Acpy + p2 + p2 * n,
+                          cosines_res, sines_diag, top, bottom);
+
+                break;
+            }
+
+            case CLASSICQR:
+            default:
+            {
+                // (Until STEQR is parallelized, only the first thread associated
+                // to each sub-block does computations)
+                if(tidb == 0)
+                    run_steqr(sbs, D + p2, E + p2, C + p2 + p2 * ldc, ldc, info, W + p2 * 2,
+                              30 * bs, eps, ssfmin, ssfmax, false);
+            }
+            } // end switch
+            __syncthreads();
+        }
+    }
+}
+
+//--------------------------------------------------------------------------------------//
+/** STEDC_MERGE_KERNEL implements the main loop of the DC algorithm to merge the
+    eigenvalues/eigenvectors of the different sub-blocks of each split-block.
+    A matrix in the batch could have many split-blocks, and each split-block could be
+    divided in a maximum of nn sub-blocks.
+    - Call this kernel with batch_count groups in y, and STEDC_NUM_SPLIT_BLKS groups in x.
+    Groups are size STEDC_BDIM.
+ - STEDC_NUM_SPLIT_BLKS is fixed (is the number of split-blocks that will be analysed + in parallel). If there are actually more split-blocks, some groups will work with more + than one split-block sequentially. **/ template -ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_int n, - S* DD, - const rocblas_stride strideD, - S* EE, - const rocblas_stride strideE, - S* CC, - const rocblas_int shiftC, - const rocblas_int ldc, - const rocblas_stride strideC, - rocblas_int* iinfo, - S* WA, - S* tmpzA, - S* vecsA, - rocblas_int* splitsA, - const S eps, - const S ssfmin, - const S ssfmax, - const rocblas_int maxblks, - int solver_mode) +ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_merge_kernel(const rocblas_int n, + S* DD, + const rocblas_stride strideD, + S* EE, + const rocblas_stride strideE, + S* CC, + const rocblas_int shiftC, + const rocblas_int ldc, + const rocblas_stride strideC, + S* tmpzA, + S* vecsA, + rocblas_int* splitsA, + const S eps, + const S ssfmin, + const S ssfmax, + const int solver_mode) { // threads and groups indices /* --------------------------------------------------- */ @@ -744,19 +1133,20 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i C = load_ptr_batch(CC, bid, shiftC, strideC); S* D = DD + bid * strideD; S* E = EE + bid * strideE; - rocblas_int* info = iinfo + bid; /* --------------------------------------------------- */ // temporary arrays in global memory /* --------------------------------------------------- */ // contains the beginning of split blocks - rocblas_int* splits = splitsA + bid * (3 * n + 2); + rocblas_int* splits = splitsA + bid * (5 * n + 2); + // the sub-blocks sizes + rocblas_int* nsA = splits + n + 2; + // the sub-blocks initial positions + rocblas_int* psA = nsA + n; // if idd[i] = 0, the value in position i has been deflated - rocblas_int* idd = splits + n + 2; + rocblas_int* idd = psA + n; // container of permutations when solving the 
secular eqns rocblas_int* pers = idd + n; - // worksapce for STEQR - S* W = WA + bid * (2 * n); // the rank-1 modification vectors in the merges S* z = tmpzA + bid * (2 * n); // roots of secular equations @@ -769,18 +1159,14 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i // temporary arrays in shared memory /* --------------------------------------------------- */ - extern __shared__ rocblas_int lmem[]; - // shares the sub-blocks sizes - rocblas_int* ns = lmem; - // shares the sub-blocks initial positions - rocblas_int* ps = ns + maxblks; + extern __shared__ rocblas_int lsmem[]; // used to store temp values during the different reductions - S* inrms = reinterpret_cast(ps + maxblks); + S* inrms = reinterpret_cast(lsmem); /* --------------------------------------------------- */ // local variables /* --------------------------------------------------- */ - // total number of blocks + // total number of split blocks rocblas_int nb = splits[n + 1]; // size of split block rocblas_int bs; @@ -792,7 +1178,11 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i rocblas_int blks; // number of level of division rocblas_int levs; + // number of threads working in sub-block + rocblas_int tn; + // other aux variables S p; + rocblas_int *ns, *ps; /* --------------------------------------------------- */ // work with STEDC_NUM_SPLIT_BLKS split blocks in parallel @@ -805,583 +1195,488 @@ ROCSOLVER_KERNEL void __launch_bounds__(STEDC_BDIM) stedc_kernel(const rocblas_i p1 = splits[kb]; p2 = splits[kb + 1]; bs = p2 - p1; + ns = nsA + p1; + ps = psA + p1; // determine ideal number of sub-blocks levs = stedc_num_levels(bs, solver_mode); blks = 1 << levs; - // if split block is too small - if(blks == 1) + // arrange threads so that a group of bdim/blks threads works with each sub-block + // tn is the number of threads associated to each sub-block + tn = STEDC_BDIM / blks; + // tid indexes the sub-block + tid = id / tn; 
+        p2 = ps[tid];
+        // tidb indexes the threads in each sub-block
+        tidb = id % tn;
+
+        // 3. MERGE PHASE
+        /* ----------------------------------------------------------------- */
+        rocblas_int iam, sz, bdm;
+        S* ptz;
+
+        // main loop to work with merges on each level
+        // (once two leaves in the merge tree are identified, all related threads
+        // work together to solve the secular equation and update eigenvectors)
+        for(int k = 0; k < levs; ++k)
         {
-            switch(solver_mode)
+            // 3a. find rank-1 modification components (z and p) for this merge
+            /* ----------------------------------------------------------------- */
+            // iam indexes the sub-blocks according to its level in the merge tree
+            rocblas_int bd = 1 << k;
+            rocblas_int ss = 1;
+            bdm = bd << 1;
+            iam = tid % bdm;
+
+            // Threads with iam < bd work with components above the merge point;
+            // threads with iam >= bd work below the merge point
+            if(iam < bd && tid < blks)
             {
-            case BISECTION:
-                // TODO
-                break;
-
-            case JACOBI:
-                // TODO
-                break;
-
-            case CLASSICQR:
-            default:
-                if(id == 0)
-                {
-                    run_steqr(bs, D + p1, E + p1, C + p1 + p1 * ldc, ldc, info, W + p1 * 2, 30 * bs,
-                              eps, ssfmin, ssfmax, false);
-                }
+                sz = ns[tid];
+                for(int j = 1; j < bd - iam; ++j)
+                    sz += ns[tid + j];
+                // with this, all threads involved in a merge (above merge point)
+                // will point to the same row of C and the same off-diag element
+                ptz = C + p2 - 1 + sz;
+                p = 2 * E[p2 - 1 + sz];
+            }
+            else if(iam >= bd && tid < blks)
+            {
+                sz = 0;
+                for(int j = 0; j < iam - bd; ++j)
+                    sz += ns[tid - j - 1];
+                // with this, all threads involved in a merge (below merge point)
+                // will point to the same row of C and the same off-diag element
+                ptz = C + p2 - sz;
+                p = 2 * E[p2 - sz - 1];
             }
-        }
-        // ===== otherwise, use divide & conquer =====
-        else
-        {
-            // arrange threads so that a group of bdim/blks threads works with each sub-block
-            // tn is the number of threads associated to each sub-block
-            rocblas_int tn = STEDC_BDIM / blks;
-            // tid indexes
the sub-block - tid = id / tn; - // tidb indexes the threads in each sub-block - tidb = id % tn; - - // 1. DIVIDE PHASE - /* ----------------------------------------------------------------- */ - // (artificially divide split block into blks sub-blocks - // find initial positions of each sub-blocks) + // copy elements of z if(tidb == 0) - ns[tid] = 0; - __syncthreads(); - - // find sub-block sizes - if(id == 0) { - ns[0] = bs; - rocblas_int t, t2; - for(int i = 0; i < levs; ++i) - { - for(int j = (1 << i); j > 0; --j) - { - t = ns[j - 1]; - t2 = t / 2; - ns[j * 2 - 1] = (2 * t2 < t) ? t2 + 1 : t2; - ns[j * 2 - 2] = t2; - } - } + for(int j = 0; j < ns[tid]; ++j) + z[p2 + j] = ptz[(p2 + j) * ldc] / sqrt(2); } __syncthreads(); - // find beginning of sub-block and update D elements - p2 = 0; - for(int i = 0; i < tid; ++i) - p2 += ns[i]; - p2 += p1; + /* ----------------------------------------------------------------- */ + + // 3b. calculate deflation tolerance + /* ----------------------------------------------------------------- */ + S valf, valg, maxd, maxz; + + // first compute maximum of diagonal and z in each thread block if(tidb == 0) { - ps[tid] = p2; - if(tid > 0 && tid < blks) + maxd = std::abs(D[p2]); + maxz = std::abs(z[p2]); + for(int i = 1; i < ns[tid]; ++i) { - // perform sub-block division - p = E[p2 - 1]; - D[p2] -= p; - D[p2 - 1] -= p; + valf = std::abs(D[p2 + i]); + valg = std::abs(z[p2 + i]); + maxd = valf > maxd ? valf : maxd; + maxz = valg > maxz ? valg : maxz; } + inrms[tid] = maxd; + inrms[tid + blks] = maxz; } __syncthreads(); - /* ----------------------------------------------------------------- */ - // 2. SOLVE PHASE - /* ----------------------------------------------------------------- */ - // Solve the blks sub-blocks in parallel. 
- switch(solver_mode) + // now follow reduction process + // (using only one thread as not compute intensive) + if(iam == 0 && tidb == 0) { - case BISECTION: - // TODO - break; - - case JACOBI: - // TODO - break; - - case CLASSICQR: - default: - // (Until STEQR is parallelized, only the first thread associated - // to each sub-block do computations) - if(tidb == 0) + for(int i = 1; i < bdm; ++i) { - run_steqr(ns[tid], D + p2, E + p2, C + p2 + p2 * ldc, ldc, info, W + p2 * 2, - 30 * bs, eps, ssfmin, ssfmax, false); + valf = inrms[tid + i]; + valg = inrms[tid + blks + i]; + maxd = valf > maxd ? valf : maxd; + maxz = valg > maxz ? valg : maxz; } + inrms[tid] = maxd; + inrms[tid + blks] = maxz; } __syncthreads(); + + // tol should be 8 * eps * (max diagonal or z element participating in merge) + maxd = inrms[tid - iam]; + maxz = inrms[tid - iam + blks]; + maxd = maxz > maxd ? maxz : maxd; + S tol = 8 * eps * maxd; /* ----------------------------------------------------------------- */ - // 3. MERGE PHASE + // 3c. deflate eigenvalues /* ----------------------------------------------------------------- */ - rocblas_int iam, sz, bdm; - S* ptz; + S f, g, c, s, r; - // main loop to work with merges on each level - // (once two leaves in the merge tree are identified, all related threads - // work together to solve the secular equation and update eiegenvectors) - for(int k = 0; k < levs; ++k) + // first deflate each thread sub-block + // (only the first thread of each sub-block works as this is + // a sequential process) + if(tidb == 0) { - // 3a. 
find rank-1 modification components (z and p) for this merge - /* ----------------------------------------------------------------- */ - // iam indexes the sub-blocks according to its level in the merge tree - rocblas_int bd = 1 << k; - rocblas_int ss = 1; - bdm = bd << 1; - iam = tid % bdm; - - // Threads with iam < bd work with components above the merge point; - // threads with iam >= bd work below the merge point - if(iam < bd && tid < blks) - { - sz = ns[tid]; - for(int j = 1; j < bd - iam; ++j) - sz += ns[tid + j]; - // with this, all threads involved in a merge (above merge point) - // will point to the same row of C and the same off-diag element - ptz = C + p2 - 1 + sz; - p = 2 * E[p2 - 1 + sz]; - } - else if(iam >= bd && tid < blks) + for(int i = 0; i < ns[tid]; ++i) { - sz = 0; - for(int j = 0; j < iam - bd; ++j) - sz += ns[tid - j - 1]; - // with this, all threads involved in a merge (below merge point) - // will point to the same row of C and the same off-diag element - ptz = C + p2 - sz; - p = 2 * E[p2 - sz - 1]; - } - - // copy elements of z - if(tidb == 0) - { - for(int j = 0; j < ns[tid]; ++j) - z[p2 + j] = ptz[(p2 + j) * ldc] / sqrt(2); - } - __syncthreads(); - /* ----------------------------------------------------------------- */ - - // 3b. calculate deflation tolerance - /* ----------------------------------------------------------------- */ - S valf, valg, maxd, maxz; - - // first compute maximum of diagonal and z in each thread block - if(tidb == 0) - { - maxd = std::abs(D[p2]); - maxz = std::abs(z[p2]); - for(int i = 1; i < ns[tid]; ++i) - { - valf = std::abs(D[p2 + i]); - valg = std::abs(z[p2 + i]); - maxd = valf > maxd ? valf : maxd; - maxz = valg > maxz ? 
valg : maxz; - } - inrms[tid] = maxd; - inrms[tid + blks] = maxz; - } - __syncthreads(); - - // now follow reduction process - // (using only one thread as not compute intensive) - if(iam == 0 && tidb == 0) - { - for(int i = 1; i < bdm; ++i) + g = z[p2 + i]; + if(abs(p * g) <= tol) { - valf = inrms[tid + i]; - valg = inrms[tid + blks + i]; - maxd = valf > maxd ? valf : maxd; - maxz = valg > maxz ? valg : maxz; + // deflated ev because component in z is zero + idd[p2 + i] = 0; } - inrms[tid] = maxd; - inrms[tid + blks] = maxz; - } - __syncthreads(); - - // tol should be 8 * eps * (max diagonal or z element participating in merge) - maxd = inrms[tid - iam]; - maxz = inrms[tid - iam + blks]; - maxd = maxz > maxd ? maxz : maxd; - S tol = 8 * eps * maxd; - /* ----------------------------------------------------------------- */ - - // 3c. deflate enigenvalues - /* ----------------------------------------------------------------- */ - S f, g, c, s, r; - - // first deflate each thread sub-block - // (only the first thread of each sub-block works as this is - // a sequential process) - if(tidb == 0) - { - for(int i = 0; i < ns[tid]; ++i) + else { - g = z[p2 + i]; - if(abs(p * g) <= tol) + rocblas_int jj = 1; + valg = D[p2 + i]; + for(int j = 0; j < i; ++j) { - // deflated ev because component in z is zero - idd[p2 + i] = 0; - } - else - { - rocblas_int jj = 1; - valg = D[p2 + i]; - for(int j = 0; j < i; ++j) + if(idd[p2 + j] == 1 && abs(D[p2 + j] - valg) <= tol) { - if(idd[p2 + j] == 1 && abs(D[p2 + j] - valg) <= tol) + // deflated ev because it is repeated + idd[p2 + i] = 0; + // rotation to eliminate component in z + f = z[p2 + j]; + lartg(f, g, c, s, r); + z[p2 + j] = r; + z[p2 + i] = 0; + // update C with the rotation + for(int ii = 0; ii < n; ++ii) { - // deflated ev because it is repeated - idd[p2 + i] = 0; - // rotation to eliminate component in z - f = z[p2 + j]; - lartg(f, g, c, s, r); - z[p2 + j] = r; - z[p2 + i] = 0; - // update C with the rotation - for(int ii = 
0; ii < n; ++ii) - { - valf = C[ii + (p2 + j) * ldc]; - valg = C[ii + (p2 + i) * ldc]; - C[ii + (p2 + j) * ldc] = valf * c - valg * s; - C[ii + (p2 + i) * ldc] = valf * s + valg * c; - } - break; + valf = C[ii + (p2 + j) * ldc]; + valg = C[ii + (p2 + i) * ldc]; + C[ii + (p2 + j) * ldc] = valf * c - valg * s; + C[ii + (p2 + i) * ldc] = valf * s + valg * c; } - jj++; - } - if(jj > i) - { - // non-deflated ev - idd[p2 + i] = 1; + break; } + jj++; + } + if(jj > i) + { + // non-deflated ev + idd[p2 + i] = 1; } } } - __syncthreads(); + } + __syncthreads(); - // then compare with other sub-blocks participating in this merge - // following a simple, reduction-like process. - // (only the first thread of each sub-block works in the reduction) - for(int ii = 0; ii <= k; ++ii) + // then compare with other sub-blocks participating in this merge + // following a simple, reduction-like process. + // (only the first thread of each sub-block works in the reduction) + for(int ii = 0; ii <= k; ++ii) + { + if(tidb == 0) { - if(tidb == 0) + rocblas_int div = 1 << (ii + 1); + //actual number of threads is halved each time + if(iam % div == div - 1) { - rocblas_int div = 1 << (ii + 1); - //actual number of threads is halved each time - if(iam % div == div - 1) + // find limits + rocblas_int inb = (1 << ii) - 1; + rocblas_int inc = div - 1; + rocblas_int countb = ns[tid]; + rocblas_int countc = 0; + for(int i = inc; i > inb; --i) + countc += ns[tid - i]; + for(int i = inb; i > 0; --i) + countb += ns[tid - i]; + inb = ps[tid - inb]; + inc = ps[tid - inc]; + + // perform comparisons + for(int i = 0; i < countb; ++i) { - // find limits - rocblas_int inb = (1 << ii) - 1; - rocblas_int inc = div - 1; - rocblas_int countb = ns[tid]; - rocblas_int countc = 0; - for(int i = inc; i > inb; --i) - countc += ns[tid - i]; - for(int i = inb; i > 0; --i) - countb += ns[tid - i]; - inb = ps[tid - inb]; - inc = ps[tid - inc]; - - // perform comparisons - for(int i = 0; i < countb; ++i) + if(idd[inb + i] 
== 1) { - if(idd[inb + i] == 1) + valg = D[inb + i]; + for(int j = 0; j < countc; ++j) { - valg = D[inb + i]; - for(int j = 0; j < countc; ++j) + if(idd[inc + j] == 1 && abs(D[inc + j] - valg) <= tol) { - if(idd[inc + j] == 1 && abs(D[inc + j] - valg) <= tol) + // deflated ev because it is repeated + idd[inb + i] = 0; + // rotation to eliminate component in z + g = z[inb + i]; + f = z[inc + j]; + lartg(f, g, c, s, r); + z[inc + j] = r; + z[inb + i] = 0; + // update C with the rotation + for(int ii = 0; ii < n; ++ii) { - // deflated ev because it is repeated - idd[inb + i] = 0; - // rotation to eliminate component in z - g = z[inb + i]; - f = z[inc + j]; - lartg(f, g, c, s, r); - z[inc + j] = r; - z[inb + i] = 0; - // update C with the rotation - for(int ii = 0; ii < n; ++ii) - { - valf = C[ii + (inc + j) * ldc]; - valg = C[ii + (inb + i) * ldc]; - C[ii + (inc + j) * ldc] = valf * c - valg * s; - C[ii + (inb + i) * ldc] = valf * s + valg * c; - } - break; + valf = C[ii + (inc + j) * ldc]; + valg = C[ii + (inb + i) * ldc]; + C[ii + (inc + j) * ldc] = valf * c - valg * s; + C[ii + (inb + i) * ldc] = valf * s + valg * c; } + break; } } } } } - __syncthreads(); } - /* ----------------------------------------------------------------- */ - - // 3d. Organize data with non-deflated values to prepare secular equation - /* ----------------------------------------------------------------- */ - // determine boundaries of what would be the new merged sub-block - // 'in' will be its initial position - rocblas_int in = ps[tid - iam]; - // 'sz' will be its size (i.e. the sum of the sizes of all merging sub-blocks) - sz = ns[tid]; - for(int i = iam; i > 0; --i) - sz += ns[tid - i]; - for(int i = bdm - 1 - iam; i > 0; --i) - sz += ns[tid + i]; - - // All threads of the participating merging blocks will work together - // to solve the correspondinbg secular eqn. 
Now 'iam' indexes those threads - iam = iam * tn + tidb; - bdm *= tn; - - // define shifted arrays - S* tmpd = temps + in * n; - S* ev = evs + in; - S* diag = D + in; - rocblas_int* mask = idd + in; - S* zz = z + in; - rocblas_int* per = pers + in; - - // find degree and components of secular equation - // tmpd contains the non-deflated diagonal elements (ie. poles of the secular eqn) - // zz contains the corresponding non-zero elements of the rank-1 modif vector - rocblas_int dd = 0; - for(int i = 0; i < sz; ++i) + __syncthreads(); + } + /* ----------------------------------------------------------------- */ + + // 3d. Organize data with non-deflated values to prepare secular equation + /* ----------------------------------------------------------------- */ + // determine boundaries of what would be the new merged sub-block + // 'in' will be its initial position + rocblas_int in = ps[tid - iam]; + // 'sz' will be its size (i.e. the sum of the sizes of all merging sub-blocks) + sz = ns[tid]; + for(int i = iam; i > 0; --i) + sz += ns[tid - i]; + for(int i = bdm - 1 - iam; i > 0; --i) + sz += ns[tid + i]; + + // All threads of the participating merging blocks will work together + // to solve the corresponding secular eqn. Now 'iam' indexes those threads + iam = iam * tn + tidb; + bdm *= tn; + + // define shifted arrays + S* tmpd = temps + in * n; + S* ev = evs + in; + S* diag = D + in; + rocblas_int* mask = idd + in; + S* zz = z + in; + rocblas_int* per = pers + in; + + // find degree and components of secular equation + // tmpd contains the non-deflated diagonal elements (ie. poles of the secular eqn) + // zz contains the corresponding non-zero elements of the rank-1 modif vector + rocblas_int dd = 0; + for(int i = 0; i < sz; ++i) + { + if(mask[i] == 1) { - if(mask[i] == 1) + if(tidb == 0 && iam == 0) { - if(tidb == 0 && iam == 0) - { - per[dd] = i; - tmpd[dd] = p < 0 ? 
-diag[i] : diag[i]; - if(dd != i) - zz[dd] = zz[i]; - } - dd++; + per[dd] = i; + tmpd[dd] = p < 0 ? -diag[i] : diag[i]; + if(dd != i) + zz[dd] = zz[i]; } + dd++; } - __syncthreads(); + } + __syncthreads(); - // Order the elements in tmpd and zz using a simple parallel selection/bubble sort. - // This will allow us to find initial intervals for eigenvalue guesses - rocblas_int tsz = 1 << (levs - 1 - k); - tsz = (bs - 1) / tsz + 1; - for(int i = 0; i < tsz; ++i) + // Order the elements in tmpd and zz using a simple parallel selection/bubble sort. + // This will allow us to find initial intervals for eigenvalue guesses + rocblas_int tsz = 1 << (levs - 1 - k); + tsz = (bs - 1) / tsz + 1; + for(int i = 0; i < tsz; ++i) + { + if(i < dd) { - if(i < dd) + if(i % 2 == 0) { - if(i % 2 == 0) + for(int j = iam; j < dd / 2; j += bdm) { - for(int j = iam; j < dd / 2; j += bdm) + if(tmpd[2 * j] > tmpd[2 * j + 1]) { - if(tmpd[2 * j] > tmpd[2 * j + 1]) - { - valf = tmpd[2 * j]; - tmpd[2 * j] = tmpd[2 * j + 1]; - tmpd[2 * j + 1] = valf; - valf = zz[2 * j]; - zz[2 * j] = zz[2 * j + 1]; - zz[2 * j + 1] = valf; - bd = per[2 * j]; - per[2 * j] = per[2 * j + 1]; - per[2 * j + 1] = bd; - } + valf = tmpd[2 * j]; + tmpd[2 * j] = tmpd[2 * j + 1]; + tmpd[2 * j + 1] = valf; + valf = zz[2 * j]; + zz[2 * j] = zz[2 * j + 1]; + zz[2 * j + 1] = valf; + bd = per[2 * j]; + per[2 * j] = per[2 * j + 1]; + per[2 * j + 1] = bd; } } - else + } + else + { + for(int j = iam; j < (dd - 1) / 2; j += bdm) { - for(int j = iam; j < (dd - 1) / 2; j += bdm) + if(tmpd[2 * j + 1] > tmpd[2 * j + 2]) { - if(tmpd[2 * j + 1] > tmpd[2 * j + 2]) - { - valf = tmpd[2 * j + 1]; - tmpd[2 * j + 1] = tmpd[2 * j + 2]; - tmpd[2 * j + 2] = valf; - valf = zz[2 * j + 1]; - zz[2 * j + 1] = zz[2 * j + 2]; - zz[2 * j + 2] = valf; - bd = per[2 * j + 1]; - per[2 * j + 1] = per[2 * j + 2]; - per[2 * j + 2] = bd; - } + valf = tmpd[2 * j + 1]; + tmpd[2 * j + 1] = tmpd[2 * j + 2]; + tmpd[2 * j + 2] = valf; + valf = zz[2 * j + 1]; + zz[2 * j 
+ 1] = zz[2 * j + 2]; + zz[2 * j + 2] = valf; + bd = per[2 * j + 1]; + per[2 * j + 1] = per[2 * j + 2]; + per[2 * j + 2] = bd; } } } - __syncthreads(); } + __syncthreads(); + } + + // make dd copies of the non-deflated ordered diagonal elements + // (i.e. the poles of the secular eqn) so that the distances to the + // eigenvalues (D - lambda_i) are updated while computing each eigenvalue. + // This will prevent collapses and division by zero when an eigenvalue + // is too close to a pole. + for(int j = iam + 1; j < sz; j += bdm) + { + for(int i = 0; i < dd; ++i) + tmpd[i + j * n] = tmpd[i]; + } + + // finally copy over all diagonal elements in ev. ev will be overwritten by the + // new computed eigenvalues of the merged block + for(int i = iam; i < sz; i += bdm) + ev[i] = diag[i]; + __syncthreads(); + /* ----------------------------------------------------------------- */ - // make dd copies of the non-deflated ordered diagonal elements - // (i.e. the poles of the secular eqn) so that the distances to the - // eigenvalues (D - lambda_i) are updated while computing each eigenvalue. - // This will prevent collapses and division by zero when an eigenvalue - // is too close to a pole. - for(int j = iam + 1; j < sz; j += bdm) + // 3e. Solve secular eqns, i.e. find the dd zeros + // corresponding to non-deflated new eigenvalues of the merged block + /* ----------------------------------------------------------------- */ + // each thread will find a different zero in parallel + S a, b; + for(int j = iam; j < sz; j += bdm) + { + if(mask[j] == 1) { - for(int i = 0; i < dd; ++i) - tmpd[i + j * n] = tmpd[i]; + // find position in the ordered array + int cc = 0; + valf = p < 0 ? -ev[j] : ev[j]; + for(int jj = 0; jj < dd; ++jj) + { + if(tmpd[jj + j * n] == valf) + break; + else + cc++; + } + + // computed zero will overwrite 'ev' at the corresponding position. + // 'tmpd' will be updated with the distances D - lambda_i. + // deflated values are not changed. 
+ rocblas_int linfo; + if(cc == dd - 1) + linfo = seq_solve_ext(dd, tmpd + j * n, zz, (p < 0 ? -p : p), ev + j, eps, + ssfmin, ssfmax); + else + linfo = seq_solve(dd, tmpd + j * n, zz, (p < 0 ? -p : p), cc, ev + j, eps, + ssfmin, ssfmax); + if(p < 0) + ev[j] *= -1; } + } + __syncthreads(); - // finally copy over all diagonal elements in ev. ev will be overwritten by the - // new computed eigenvalues of the merged block - for(int i = iam; i < sz; i += bdm) - ev[i] = diag[i]; - __syncthreads(); - /* ----------------------------------------------------------------- */ - - // 3e. Solve secular eqns, i.e. find the dd zeros - // corresponding to non-deflated new eigenvalues of the merged block - /* ----------------------------------------------------------------- */ - // each thread will find a different zero in parallel - S a, b; - for(int j = iam; j < sz; j += bdm) + // Re-scale vector Z to avoid bad numerics when an eigenvalue + // is too close to a pole + for(int i = iam; i < dd; i += bdm) + { + valf = 1; + for(int j = 0; j < sz; ++j) { if(mask[j] == 1) { - // find position in the ordered array - int cc = 0; - valf = p < 0 ? -ev[j] : ev[j]; - for(int jj = 0; jj < dd; ++jj) - { - if(tmpd[jj + j * n] == valf) - break; - else - cc++; - } - - // computed zero will overwrite 'ev' at the corresponding position. - // 'tmpd' will be updated with the distances D - lambda_i. - // deflated values are not changed. - rocblas_int linfo; - if(cc == dd - 1) - linfo = seq_solve_ext(dd, tmpd + j * n, zz, (p < 0 ? -p : p), ev + j, - eps, ssfmin, ssfmax); + valg = tmpd[i + j * n]; + if(p > 0) + valf *= (per[i] == j) ? valg : valg / (diag[per[i]] - diag[j]); else - linfo = seq_solve(dd, tmpd + j * n, zz, (p < 0 ? -p : p), cc, ev + j, - eps, ssfmin, ssfmax); - if(p < 0) - ev[j] *= -1; + valf *= (per[i] == j) ? valg : -valg / (diag[per[i]] - diag[j]); } } - __syncthreads(); + valf = sqrt(-valf); + zz[i] = zz[i] < 0 ? 
-valf : valf; + } + __syncthreads(); + /* ----------------------------------------------------------------- */ - // Re-scale vector Z to avoid bad numerics when an eigenvalue - // is too close to a pole - for(int i = iam; i < dd; i += bdm) + // 3f. Compute vectors corresponding to non-deflated values + /* ----------------------------------------------------------------- */ + S temp, nrm, evj; + rocblas_int nn = (bs - 1) / blks + 1; + bool go; + for(int j = 0; j < nn; ++j) + { + go = (j < ns[tid] && idd[p2 + j] == 1); + + // compute vectors of rank-1 perturbed system and their norms + nrm = 0; + if(go) { - valf = 1; - for(int j = 0; j < sz; ++j) + evj = evs[p2 + j]; + for(int i = tidb; i < dd; i += tn) { - if(mask[j] == 1) - { - valg = tmpd[i + j * n]; - if(p > 0) - valf *= (per[i] == j) ? valg : valg / (diag[per[i]] - diag[j]); - else - valf *= (per[i] == j) ? valg : -valg / (diag[per[i]] - diag[j]); - } + valf = zz[i] / temps[i + (p2 + j) * n]; + nrm += valf * valf; + temps[i + (p2 + j) * n] = valf; } - valf = sqrt(-valf); - zz[i] = zz[i] < 0 ? -valf : valf; } + inrms[tid * tn + tidb] = nrm; __syncthreads(); - /* ----------------------------------------------------------------- */ - - // 3f. 
Compute vectors corresponding to non-deflated values - /* ----------------------------------------------------------------- */ - S temp, nrm, evj; - rocblas_int nn = (bs - 1) / blks + 1; - bool go; - for(int j = 0; j < nn; ++j) + + // reduction (for the norms) + for(int r = tn / 2; r > 0; r /= 2) { - go = (j < ns[tid] && idd[p2 + j] == 1); + if(go && tidb < r) + { + nrm += inrms[tid * tn + tidb + r]; + inrms[tid * tn + tidb] = nrm; + } + __syncthreads(); + } + nrm = sqrt(nrm); - // compute vectors of rank-1 perturbed system and their norms - nrm = 0; + // multiply by C (row by row) + for(int ii = 0; ii < tsz; ++ii) + { + rocblas_int i = in + ii; + go &= (ii < sz); + + // inner products + temp = 0; if(go) { - evj = evs[p2 + j]; - for(int i = tidb; i < dd; i += tn) - { - valf = zz[i] / temps[i + (p2 + j) * n]; - nrm += valf * valf; - temps[i + (p2 + j) * n] = valf; - } + for(int kk = tidb; kk < dd; kk += tn) + temp += C[i + (per[kk] + in) * ldc] * temps[kk + (p2 + j) * n]; } - inrms[tid * tn + tidb] = nrm; + inrms[tid * tn + tidb] = temp; __syncthreads(); - // reduction (for the norms) + // reduction for(int r = tn / 2; r > 0; r /= 2) { if(go && tidb < r) { - nrm += inrms[tid * tn + tidb + r]; - inrms[tid * tn + tidb] = nrm; + temp += inrms[tid * tn + tidb + r]; + inrms[tid * tn + tidb] = temp; } __syncthreads(); } - nrm = sqrt(nrm); - - // multiply by C (row by row) - for(int ii = 0; ii < tsz; ++ii) - { - rocblas_int i = in + ii; - go &= (ii < sz); - // inner products - temp = 0; - if(go) - { - for(int kk = tidb; kk < dd; kk += tn) - temp += C[i + (per[kk] + in) * ldc] * temps[kk + (p2 + j) * n]; - } - inrms[tid * tn + tidb] = temp; - __syncthreads(); - - // reduction - for(int r = tn / 2; r > 0; r /= 2) - { - if(go && tidb < r) - { - temp += inrms[tid * tn + tidb + r]; - inrms[tid * tn + tidb] = temp; - } - __syncthreads(); - } - - // result - if(go && tidb == 0) - vecs[i + (p2 + j) * n] = temp / nrm; - __syncthreads(); - } + // result + if(go && tidb == 0) + 
vecs[i + (p2 + j) * n] = temp / nrm; + __syncthreads(); } - __syncthreads(); - /* ----------------------------------------------------------------- */ + } + __syncthreads(); + /* ----------------------------------------------------------------- */ - // 3g. update D and C with computed values and vectors - /* ----------------------------------------------------------------- */ - for(int j = 0; j < nn; ++j) + // 3g. update D and C with computed values and vectors + /* ----------------------------------------------------------------- */ + for(int j = 0; j < nn; ++j) + { + if(j < ns[tid] && idd[p2 + j] == 1) { - if(j < ns[tid] && idd[p2 + j] == 1) - { - if(tidb == 0) - D[p2 + j] = evs[p2 + j]; - for(int i = in + tidb; i < in + sz; i += tn) - C[i + (p2 + j) * ldc] = vecs[i + (p2 + j) * n]; - } - __syncthreads(); + if(tidb == 0) + D[p2 + j] = evs[p2 + j]; + for(int i = in + tidb; i < in + sz; i += tn) + C[i + (p2 + j) * ldc] = vecs[i + (p2 + j) * n]; } - /* ----------------------------------------------------------------- */ - - } // end of main loop in merge phase of divide & conquer + __syncthreads(); + } + /* ----------------------------------------------------------------- */ - } // end of conditional that decides when to use normal algorithm or divide & conquer + } // end of main loop in merge phase of divide & conquer } // end of for-loop for the independent split blocks } @@ -1431,6 +1726,10 @@ ROCSOLVER_KERNEL void stedc_sort(const rocblas_int n, } } +/******************* Host functions ********************************************/ +/*******************************************************************************/ + +//--------------------------------------------------------------------------------------// /** This local gemm adapts rocblas_gemm to multiply complex*real, and overwrite result: A = A*B **/ template void rocsolver_stedc_getMemorySize(const rocblas_evect evect, @@ -1598,11 +1898,16 @@ void rocsolver_stedc_getMemorySize(const rocblas_evect evect, { 
size_t s1, s2; - // requirements for classic solver of small independent blocks - if(solver_mode == BISECTION || solver_mode == JACOBI) - s1 = 0; - else - rocsolver_steqr_getMemorySize(evect, n, batch_count, &s1); + // requirements for solver of small independent blocks + switch(solver_mode) + { + case BISECTION: s1 = 0; break; + + case JACOBI: s1 = sizeof(S) * (n * n + 2) * batch_count; break; + + case CLASSICQR: + default: rocsolver_steqr_getMemorySize(evect, n, batch_count, &s1); + } // extra requirements for original eigenvectors of small independent blocks *size_tempvect = (n * n) * batch_count * sizeof(S); @@ -1618,13 +1923,14 @@ void rocsolver_stedc_getMemorySize(const rocblas_evect evect, *size_work_stack = max(s1, s2); // size for split blocks and sub-blocks positions - *size_splits = sizeof(rocblas_int) * (3 * n + 2) * batch_count; + *size_splits = sizeof(rocblas_int) * (5 * n + 2) * batch_count; // size for temporary diagonal and rank-1 modif vector *size_tmpz = sizeof(S) * (2 * n) * batch_count; } } +//--------------------------------------------------------------------------------------// /** This helper check argument correctness for stedc API **/ template rocblas_status rocsolver_stedc_argCheck(rocblas_handle handle, @@ -1660,6 +1966,7 @@ rocblas_status rocsolver_stedc_argCheck(rocblas_handle handle, return rocblas_status_continue; } +//--------------------------------------------------------------------------------------// /** STEDC templated function **/ template rocblas_status rocsolver_stedc_template(rocblas_handle handle, @@ -1734,10 +2041,6 @@ rocblas_status rocsolver_stedc_template(rocblas_handle handle, ssfmax = sqrt(ssfmax) / S(3.0); rocblas_int blocksn = (n - 1) / BS2 + 1; - // find independent split blocks in matrix - ROCSOLVER_LAUNCH_KERNEL(stedc_split, dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, - strideD, E + shiftE, strideE, splits, eps); - // initialize identity matrix in C if required if(evect == rocblas_evect_tridiagonal) 
ROCSOLVER_LAUNCH_KERNEL(init_ident, dim3(blocksn, blocksn, batch_count), @@ -1751,19 +2054,44 @@ rocblas_status rocsolver_stedc_template(rocblas_handle handle, // find max number of sub-blocks to consider during the divide phase rocblas_int maxblks = 1 << stedc_num_levels(n, solver_mode); - size_t lmemsize = sizeof(rocblas_int) * 2 * maxblks + sizeof(S) * STEDC_BDIM; - // execute divide and conquer kernel with tempvect - ROCSOLVER_LAUNCH_KERNEL((stedc_kernel), dim3(STEDC_NUM_SPLIT_BLKS, batch_count), + // find independent split blocks in matrix + ROCSOLVER_LAUNCH_KERNEL(stedc_split, dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, + strideD, E + shiftE, strideE, splits, eps); + + // 1. divide phase + //----------------------------- + ROCSOLVER_LAUNCH_KERNEL((stedc_divide_kernel), dim3(batch_count), dim3(STEDC_BDIM), 0, + stream, n, D + shiftD, strideD, E + shiftE, strideE, splits, + solver_mode); + + // 2. solve phase + //----------------------------- + size_t lmemsize = 0; + if(solver_mode == JACOBI) + lmemsize += (n + n % 2) * (sizeof(rocblas_int) + sizeof(S)); + + ROCSOLVER_LAUNCH_KERNEL((stedc_solve_kernel), + dim3(maxblks, STEDC_NUM_SPLIT_BLKS, batch_count), dim3(STEDC_BDIM), + lmemsize, stream, n, D + shiftD, strideD, E + shiftE, strideE, + tempvect, 0, ldt, strideT, info, (S*)work_stack, splits, eps, + ssfmin, ssfmax, solver_mode); + + // 3. merge phase + //---------------- + lmemsize = sizeof(S) * STEDC_BDIM; + + ROCSOLVER_LAUNCH_KERNEL((stedc_merge_kernel), dim3(STEDC_NUM_SPLIT_BLKS, batch_count), dim3(STEDC_BDIM), lmemsize, stream, n, D + shiftD, strideD, - E + shiftE, strideE, tempvect, 0, ldt, strideT, info, (S*)work_stack, - tmpz, tempgemm, splits, eps, ssfmin, ssfmax, maxblks, solver_mode); + E + shiftE, strideE, tempvect, 0, ldt, strideT, tmpz, tempgemm, + splits, eps, ssfmin, ssfmax, solver_mode); - // update eigenvectors C <- C*tempvect + // 4. 
update and sort + //---------------------- + // eigenvectors C <- C*tempvect local_gemm(handle, n, C, shiftC, ldc, strideC, tempvect, tempgemm, (S*)work_stack, 0, ldt, strideT, batch_count, workArr); - // finally sort eigenvalues and eigenvectors ROCSOLVER_LAUNCH_KERNEL((stedc_sort), dim3(batch_count), dim3(1), 0, stream, n, D + shiftD, strideD, C, shiftC, ldc, strideC); } diff --git a/library/src/include/ideal_sizes.hpp b/library/src/include/ideal_sizes.hpp index ad965bb46..903aed00f 100644 --- a/library/src/include/ideal_sizes.hpp +++ b/library/src/include/ideal_sizes.hpp @@ -228,6 +228,14 @@ if any, will be factorized with the unblocked algorithm (SYTF2).*/ #define SYTRF_SYTF2_SWITCHSIZE 128 +/****************************** syevdj ****************************************** +*******************************************************************************/ +/*! \brief Determines the minimum size required for the Jacobi divide and conquer to be used. + + \details If the size of the block is smaller than SYEVDJ_MIN_DC_SIZE, + the eigenvectors are computed with the normal Jacobi algorithm. */ +#define SYEVDJ_MIN_DC_SIZE 16 + /**************************** getf2/getfr ************************************* *******************************************************************************/ #define GETF2_SPKER_MAX_M 1024 //always <= 1024 diff --git a/library/src/lapack/roclapack_syevdj_heevdj.cpp b/library/src/lapack/roclapack_syevdj_heevdj.cpp new file mode 100644 index 000000000..68073a543 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj.cpp @@ -0,0 +1,173 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_syevdj_heevdj.hpp" + +template +rocblas_status rocsolver_syevdj_heevdj_impl(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int lda, + S* D, + rocblas_int* info) +{ + const char* name = (!rocblas_is_complex ? 
"syevdj" : "heevdj"); + ROCSOLVER_ENTER_TOP(name, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", lda); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_syevdj_heevdj_argCheck(handle, evect, uplo, n, A, lda, D, info); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // normal (non-batched non-strided) execution + rocblas_stride strideA = 0; + rocblas_stride strideD = 0; + rocblas_int batch_count = 1; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + size_t size_work1; + size_t size_work2; + size_t size_work3; + // extra space for call stedc + size_t size_workSplits, size_work4; + // size of array of pointers (only for batched case) + size_t size_workArr; + // size for temporary householder scalars + size_t size_workTau; + // size for temporary vectors + size_t size_workVec; + // size for temporary superdiagonal of tridiag form + size_t size_workE; + + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &size_scalars, &size_workE, &size_workTau, &size_workVec, + &size_workSplits, &size_work1, &size_work2, &size_work3, &size_work4, &size_workArr); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_workE, size_workTau, size_workVec, size_workSplits, + size_work1, size_work2, size_work3, size_work4, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workVec, *workSplits, *workTau, *workArr; + rocblas_device_malloc mem(handle, size_scalars, size_workE, size_workTau, size_workVec, + size_workSplits, size_work1, size_work2, size_work3, size_work4, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + workE = mem[1]; + workTau = mem[2]; + workVec = mem[3]; + workSplits = 
mem[4]; + work1 = mem[5]; + work2 = mem[6]; + work3 = mem[7]; + work4 = mem[8]; + workArr = mem[9]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, info, batch_count, (T*)scalars, + (S*)workE, (T*)workTau, (T*)workVec, (rocblas_int*)workSplits, work1, work2, work3, work4, + workArr); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, info); +} + +rocblas_status rocsolver_dsyevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, info); +} + +rocblas_status rocsolver_cheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + float* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, + info); +} + +rocblas_status rocsolver_zheevdj(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + double* D, + rocblas_int* info) +{ + return rocsolver_syevdj_heevdj_impl(handle, evect, uplo, n, A, lda, D, + info); +} + +} // extern C diff --git a/library/src/lapack/roclapack_syevdj_heevdj.hpp b/library/src/lapack/roclapack_syevdj_heevdj.hpp new file mode 100644 index 
000000000..0ba73bb06 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj.hpp @@ -0,0 +1,265 @@ +/************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ************************************************************************/ + +#pragma once + +#include "auxiliary/rocauxiliary_ormtr_unmtr.hpp" +#include "auxiliary/rocauxiliary_stedc.hpp" +#include "rocblas.hpp" +#include "roclapack_syev_heev.hpp" +#include "roclapack_sytrd_hetrd.hpp" +#include "rocsolver/rocsolver.h" + +/** Helper to calculate workspace sizes **/ +template +void rocsolver_syevdj_heevdj_getMemorySize(const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + const rocblas_int batch_count, + size_t* size_scalars, + size_t* size_workE, + size_t* size_workTau, + size_t* size_workVec, + size_t* size_workSplits, + size_t* size_work1, + size_t* size_work2, + size_t* size_work3, + size_t* size_work4, + size_t* size_workArr) +{ + // if quick return, set workspace to zero + if(n <= 1 || batch_count == 0) + { + *size_scalars = 0; + *size_work1 = 0; + *size_work2 = 0; + *size_work3 = 0; + *size_work4 = 0; + *size_workSplits = 0; + *size_workE = 0; + *size_workTau = 0; + *size_workVec = 0; + *size_workArr = 0; + return; + } + + // if size too small or no vectors required + if(evect != rocblas_evect_original || n < SYEVDJ_MIN_DC_SIZE) + { + // space to store the residual + *size_workE = sizeof(S) * batch_count; + + // space to store the number of sweeps + *size_workSplits = sizeof(rocblas_int) * batch_count; + + // requirements for jacobi + rocsolver_syevj_heevj_getMemorySize(evect, uplo, n, batch_count, + size_workVec, size_workTau, size_work1, + size_work2, size_work3, size_work4); + + *size_scalars = 0; + *size_workArr = 0; + + return; + } + + size_t unused; + size_t w11 = 0, w12 = 0, w13 = 0; + size_t w21 = 0, w22 = 0, w23 = 0; + size_t w31 = 0, w32 = 0, w33 = 0; + + // space for the superdiagonal of tridiag form + *size_workE = sizeof(S) * n * batch_count; + + // space for the householder scalars + *size_workTau = sizeof(T) * n * batch_count; + + // temp space for eigenvectors + *size_workVec = sizeof(T) * n * n * batch_count; + + 
// requirements for tridiagonalization (sytrd/hetrd) + rocsolver_sytrd_hetrd_getMemorySize(n, batch_count, size_scalars, &w11, &w21, &w31, + &unused); + + // extra requirements for computing eigenvalues and vectors (stedc) + int solver_mode = JACOBI; + rocsolver_stedc_getMemorySize(rocblas_evect_tridiagonal, n, batch_count, &w12, + &w22, &w32, size_work4, size_workSplits, &unused, + solver_mode); + + // extra requirements for ormtr/unmtr + rocsolver_ormtr_unmtr_getMemorySize(rocblas_side_left, uplo, n, n, batch_count, + &unused, &w13, &w23, &w33, &unused); + + // get max values + *size_work1 = std::max({w11, w12, w13}); + *size_work2 = std::max({w21, w22, w23}); + *size_work3 = std::max({w31, w32, w33}); + + // size of array of pointers to workspace + if(BATCHED) + *size_workArr = 2 * sizeof(T*) * batch_count; + else + *size_workArr = 0; +} + +/** Argument checking **/ +template +rocblas_status rocsolver_syevdj_heevdj_argCheck(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + T A, + const rocblas_int lda, + S* D, + rocblas_int* info, + const rocblas_int batch_count = 1) +{ + // order is important for unit tests: + + // 1. invalid/non-supported values + if((evect != rocblas_evect_original && evect != rocblas_evect_none) + || (uplo != rocblas_fill_lower && uplo != rocblas_fill_upper)) + return rocblas_status_invalid_value; + + // 2. invalid size + if(n < 0 || lda < n || batch_count < 0) + return rocblas_status_invalid_size; + + // skip pointer check if querying memory size + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_status_continue; + + // 3. 
invalid pointers + if((n && !A) || (n && !D) || (batch_count && !info)) + return rocblas_status_invalid_pointer; + + return rocblas_status_continue; +} + +template +rocblas_status rocsolver_syevdj_heevdj_template(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count, + T* scalars, + S* workE, + T* workTau, + T* workVec, + rocblas_int* workSplits, + void* work1, + void* work2, + void* work3, + void* work4, + void* workArr) +{ + ROCSOLVER_ENTER("syevdj_heevdj", "evect:", evect, "uplo:", uplo, "n:", n, "shiftA:", shiftA, + "lda:", lda, "bc:", batch_count); + + // quick return + if(batch_count == 0) + return rocblas_status_success; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + rocblas_int blocksReset = (batch_count - 1) / BS1 + 1; + dim3 gridReset(blocksReset, 1, 1); + dim3 threads(BS1, 1, 1); + + // info = 0 + ROCSOLVER_LAUNCH_KERNEL(reset_info, gridReset, threads, 0, stream, info, batch_count, 0); + + // quick return + if(n == 0) + return rocblas_status_success; + + // quick return for n = 1 (scalar case) + if(n == 1) + { + ROCSOLVER_LAUNCH_KERNEL(scalar_case, gridReset, threads, 0, stream, evect, A, strideA, D, + strideD, batch_count); + return rocblas_status_success; + } + + // TODO: Scale the matrix + + if(evect != rocblas_evect_original || n < SYEVDJ_MIN_DC_SIZE) + { + // **** do not use D&C approach **** + + rocsolver_syevj_heevj_template( + handle, rocblas_esort_ascending, evect, uplo, n, A, shiftA, lda, strideA, (S)0, workE, + 20, workSplits, D, strideD, info, batch_count, workVec, workTau, (S*)work1, + (rocblas_int*)work2, (rocblas_int*)work3, (rocblas_int*)work4); + } + else + { + // **** Use D&C approach **** + + // reduce A to tridiagonal form + // (Note: a tridiag form is necessary to apply D&C. 
To solve the subblocks with Jacobi will + // require copy D and E into a full tridiag matrix however, given all the zeros above the super diagonal, + // it is expected that the algorithm converges in fewer sweeps) + rocsolver_sytrd_hetrd_template(handle, uplo, n, A, shiftA, lda, strideA, D, + strideD, workE, n, workTau, n, batch_count, scalars, + (T*)work1, (T*)work2, (T*)work3, (T**)workArr); + + constexpr bool ISBATCHED = BATCHED || STRIDED; + const rocblas_int ldv = n; + const rocblas_stride strideV = n * n; + + // solve with Jacobi solver + int solver_mode = JACOBI; + rocsolver_stedc_template( + handle, rocblas_evect_tridiagonal, n, D, 0, strideD, workE, 0, n, workVec, 0, ldv, + strideV, info, batch_count, work1, (S*)work2, (S*)work3, (S*)work4, workSplits, + (S**)workArr, solver_mode); + + // update vectors + rocsolver_ormtr_unmtr_template( + handle, rocblas_side_left, uplo, rocblas_operation_none, n, n, A, shiftA, lda, strideA, + workTau, n, workVec, 0, ldv, strideV, batch_count, scalars, (T*)work1, (T*)work2, + (T*)work3, (T**)workArr); + + // copy vectors into A + const rocblas_int copyblocks = (n - 1) / BS2 + 1; + ROCSOLVER_LAUNCH_KERNEL(copy_mat, dim3(copyblocks, copyblocks, batch_count), + dim3(BS2, BS2), 0, stream, n, n, workVec, 0, ldv, strideV, A, + shiftA, lda, strideA); + } + + return rocblas_status_success; +} diff --git a/library/src/lapack/roclapack_syevdj_heevdj_batched.cpp b/library/src/lapack/roclapack_syevdj_heevdj_batched.cpp new file mode 100644 index 000000000..266f40d43 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj_batched.cpp @@ -0,0 +1,185 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_syevdj_heevdj.hpp" + +template +rocblas_status rocsolver_syevdj_heevdj_batched_impl(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int lda, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"syevdj_batched" : "heevdj_batched"); + ROCSOLVER_ENTER_TOP(name, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", lda, "--strideD", + strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st + = rocsolver_syevdj_heevdj_argCheck(handle, evect, uplo, n, A, lda, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // batched execution + rocblas_stride strideA = 0; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + size_t size_work1; + size_t size_work2; + size_t size_work3; + // extra space for call stedc + size_t size_workSplits, size_work4; + // size of array of pointers (only for batched case) + size_t size_workArr; + // size for temporary householder scalars + size_t size_workTau; + // size for temporary vectors + size_t size_workVec; + // size for temporary superdiagonal of tridiag form + size_t size_workE; + + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &size_scalars, &size_workE, &size_workTau, &size_workVec, + &size_workSplits, &size_work1, &size_work2, &size_work3, &size_work4, &size_workArr); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_workE, size_workTau, size_workVec, size_workSplits, + size_work1, size_work2, size_work3, size_work4, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workVec, *workSplits, *workTau, *workArr; + rocblas_device_malloc mem(handle, size_scalars, size_workE, size_workTau, size_workVec, + size_workSplits, size_work1, size_work2, size_work3, size_work4, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + workE = mem[1]; + workTau = mem[2]; + workVec = mem[3]; + workSplits = 
mem[4]; + work1 = mem[5]; + work2 = mem[6]; + work3 = mem[7]; + work4 = mem[8]; + workArr = mem[9]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, info, batch_count, (T*)scalars, + (S*)workE, (T*)workTau, (T*)workVec, (rocblas_int*)workSplits, work1, work2, work3, work4, + workArr); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl(handle, evect, uplo, n, A, lda, D, strideD, + info, batch_count); +} + +rocblas_status rocsolver_dsyevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl(handle, evect, uplo, n, A, lda, D, strideD, + info, batch_count); +} + +rocblas_status rocsolver_cheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl( + handle, evect, uplo, n, A, lda, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_zheevdj_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const 
rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_batched_impl( + handle, evect, uplo, n, A, lda, D, strideD, info, batch_count); +} + +} // extern C diff --git a/library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp b/library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp new file mode 100644 index 000000000..e109755b7 --- /dev/null +++ b/library/src/lapack/roclapack_syevdj_heevdj_strided_batched.cpp @@ -0,0 +1,189 @@ +/* ************************************************************************ + * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_syevdj_heevdj.hpp" + +template +rocblas_status rocsolver_syevdj_heevdj_strided_batched_impl(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + W A, + const rocblas_int lda, + const rocblas_stride strideA, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"syevdj_strided_batched" : "heevdj_strided_batched"); + ROCSOLVER_ENTER_TOP(name, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", lda, "--strideA", + strideA, "--strideD", strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st + = rocsolver_syevdj_heevdj_argCheck(handle, evect, uplo, n, A, lda, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + + // strided_batched execution + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + size_t size_work1; + size_t size_work2; + size_t size_work3; + // extra space for call stedc + size_t size_workSplits, size_work4; + // size of array of pointers (only for batched case) + size_t size_workArr; + // size for temporary householder scalars + size_t size_workTau; + // size for temporary vectors + size_t size_workVec; + // size for temporary superdiagonal of tridiag form + size_t size_workE; + + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &size_scalars, &size_workE, &size_workTau, &size_workVec, + &size_workSplits, &size_work1, &size_work2, &size_work3, &size_work4, &size_workArr); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_workE, size_workTau, size_workVec, size_workSplits, + size_work1, size_work2, size_work3, size_work4, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workVec, *workSplits, *workTau, *workArr; + rocblas_device_malloc mem(handle, size_scalars, size_workE, size_workTau, size_workVec, + size_workSplits, size_work1, size_work2, size_work3, size_work4, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + workE = mem[1]; + workTau = mem[2]; + workVec = mem[3]; + 
workSplits = mem[4]; + work1 = mem[5]; + work2 = mem[6]; + work3 = mem[7]; + work4 = mem[8]; + workArr = mem[9]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, info, batch_count, (T*)scalars, + (S*)workE, (T*)workTau, (T*)workVec, (rocblas_int*)workSplits, work1, work2, work3, work4, + workArr); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_dsyevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_cheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, 
strideD, info, batch_count); +} + +rocblas_status rocsolver_zheevdj_strided_batched(rocblas_handle handle, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_syevdj_heevdj_strided_batched_impl( + handle, evect, uplo, n, A, lda, strideA, D, strideD, info, batch_count); +} + +} // extern C diff --git a/library/src/lapack/roclapack_syevj_heevj.hpp b/library/src/lapack/roclapack_syevj_heevj.hpp index 840f3aebd..8dfa690f2 100644 --- a/library/src/lapack/roclapack_syevj_heevj.hpp +++ b/library/src/lapack/roclapack_syevj_heevj.hpp @@ -6,7 +6,30 @@ * and * Hari & Kovac (2019). On the Convergence of Complex Jacobi Methods. * Linear and Multilinear Algebra 69(3), p. 489-514. - * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. + * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * ***********************************************************************/ #pragma once @@ -95,8 +118,6 @@ __device__ void run_syevj(const rocblas_int dimx, } } } - cosines_res[tix] = local_res; - sines_diag[tix] = local_diag; } else { @@ -123,9 +144,9 @@ __device__ void run_syevj(const rocblas_int dimx, } } } - cosines_res[tix] = local_res; - sines_diag[tix] = local_diag; } + cosines_res[tix] = local_res; + sines_diag[tix] = local_diag; // initialize top/bottom pairs for(i = tix; i < half_n; i += dimx) @@ -155,9 +176,8 @@ __device__ void run_syevj(const rocblas_int dimx, { for(rocblas_int cc = 0; cc < count; ++cc) { - rocblas_int kx = tix + cc * dimx; - // get current top/bottom pair + rocblas_int kx = tix + cc * dimx; i = kx < half_n ? top[kx] : n; j = kx < half_n ? 
bottom[kx] : n; @@ -281,6 +301,7 @@ __device__ void run_syevj(const rocblas_int dimx, if(tiy == 0) { local_res = 0; + for(i = tix; i < n; i += dimx) { for(j = 0; j < i; j++) @@ -297,33 +318,35 @@ __device__ void run_syevj(const rocblas_int dimx, sweeps++; } - if(tiy > 0) - return; - // finalize outputs - if(tix == 0) + if(tiy == 0) { - *residual = sqrt(local_res); - if(sweeps <= max_sweeps) - { - *n_sweeps = sweeps; - *info = 0; - } - else + if(tix == 0) { - *n_sweeps = max_sweeps; - *info = 1; + *residual = sqrt(local_res); + if(sweeps <= max_sweeps) + { + *n_sweeps = sweeps; + *info = 0; + } + else + { + *n_sweeps = max_sweeps; + *info = 1; + } } - } - // update W and then sort eigenvalues and eigenvectors by selection sort - for(i = tix; i < n; i += dimx) - W[i] = std::real(Acpy[i + i * n]); + // update W + for(i = tix; i < n; i += dimx) + W[i] = std::real(Acpy[i + i * n]); + } __syncthreads(); - if((evect == rocblas_evect_none && tix > 0) || esort == rocblas_esort_none) + // if no sort, then stop + if(esort == rocblas_esort_none) return; + //otherwise sort eigenvalues and eigenvectors by selection sort rocblas_int m; S p; for(j = 0; j < n - 1; j++) @@ -340,7 +363,7 @@ __device__ void run_syevj(const rocblas_int dimx, } __syncthreads(); - if(m != j) + if(m != j && tiy == 0) { if(tix == 0) { @@ -354,19 +377,23 @@ __device__ void run_syevj(const rocblas_int dimx, swap(A[i + m * lda], A[i + j * lda]); } } + __syncthreads(); } } -__host__ __device__ inline void syevj_get_dims(rocblas_int n, rocblas_int* ddx, rocblas_int* ddy) +__host__ __device__ inline void + syevj_get_dims(rocblas_int n, rocblas_int bdim, rocblas_int* ddx, rocblas_int* ddy) { // (TODO: Some tuning could be beneficial in the future. // For now, we use a max of BDIM = ddx * ddy threads. - // ddy is set to ceil(n/2) and ddx to min(BDIM/ddy, ceil(n/2)). + // ddy is set to min(BDIM/4, ceil(n/2)) and ddx to min(BDIM/ddy, ceil(n/2)). 
rocblas_int even_n = n + n % 2; rocblas_int half_n = even_n / 2; - *ddy = half_n; - *ddx = min(SYEVJ_BDIM / half_n, half_n); + rocblas_int y = min(bdim / 4, half_n); + rocblas_int x = min(bdim / y, half_n); + *ddx = x; + *ddy = y; } template @@ -403,7 +430,7 @@ ROCSOLVER_KERNEL void __launch_bounds__(SYEVJ_BDIM) syevj_small_kernel(const roc // get dimensions of 2D thread array rocblas_int ddx, ddy; - syevj_get_dims(n, &ddx, &ddy); + syevj_get_dims(n, SYEVJ_BDIM, &ddx, &ddy); // shared memory extern __shared__ double lmem[]; @@ -1436,7 +1463,7 @@ rocblas_status rocsolver_syevj_heevj_template(rocblas_handle handle, // (TODO: SYEVJ_BLOCKED_SWITCH may need re-tuning as it could be larger than 64 now). rocblas_int ddx, ddy; - syevj_get_dims(n, &ddx, &ddy); + syevj_get_dims(n, SYEVJ_BDIM, &ddx, &ddy); dim3 grid(1, 1, batch_count); dim3 threads(ddx * ddy, 1, 1); size_t lmemsize = (sizeof(S) + sizeof(T)) * ddx + 2 * sizeof(rocblas_int) * half_n; diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj.cpp b/library/src/lapack/roclapack_sygvdj_hegvdj.cpp new file mode 100644 index 000000000..7469995d1 --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj.cpp @@ -0,0 +1,194 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_sygvdj_hegvdj.hpp" + +template +rocblas_status rocsolver_sygvdj_hegvdj_impl(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int lda, + U B, + const rocblas_int ldb, + S* D, + rocblas_int* info) +{ + const char* name = (!rocblas_is_complex ? 
"sygvdj" : "hegvdj"); + ROCSOLVER_ENTER_TOP(name, "--itype", itype, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", + lda, "--ldb", ldb); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st + = rocsolver_sygvdj_hegvdj_argCheck(handle, itype, evect, uplo, n, lda, ldb, A, B, D, info); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + rocblas_int shiftB = 0; + + // normal (non-batched non-strided) execution + rocblas_stride strideA = 0; + rocblas_stride strideB = 0; + rocblas_stride strideD = 0; + rocblas_int batch_count = 1; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + bool optim_mem; + size_t size_work1, size_work2, size_work3, size_work4; + // extra requirements for calling SYEVDJ/HEEVDJ + size_t size_workE; + size_t size_workTau; + size_t size_workVec; + size_t size_workSplits; + size_t size_workArr; + // size of temporary info array + size_t size_iinfo; + + rocsolver_sygvdj_hegvdj_getMemorySize( + itype, evect, uplo, n, batch_count, &size_scalars, &size_work1, &size_work2, &size_work3, + &size_work4, &size_workE, &size_workTau, &size_workVec, &size_workSplits, &size_iinfo, + &size_workArr, &optim_mem); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_work1, size_work2, size_work3, size_work4, size_workE, + size_workTau, size_workVec, size_workSplits, size_iinfo, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workTau, *workVec, *workSplits, + *workArr, *iinfo; + rocblas_device_malloc mem(handle, size_scalars, size_work1, size_work2, size_work3, size_work4, + size_workE, size_workTau, size_workVec, size_workSplits, size_iinfo, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + work1 = 
mem[1]; + work2 = mem[2]; + work3 = mem[3]; + work4 = mem[4]; + workE = mem[5]; + workTau = mem[6]; + workVec = mem[7]; + workSplits = mem[8]; + iinfo = mem[9]; + workArr = mem[10]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_sygvdj_hegvdj_template( + handle, itype, evect, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, D, strideD, + info, batch_count, (T*)scalars, work1, work2, work3, work4, (S*)workE, (T*)workTau, + (T*)workVec, (rocblas_int*)workSplits, (rocblas_int*)iinfo, workArr, optim_mem); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + float* B, + const rocblas_int ldb, + float* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, lda, B, ldb, D, + info); +} + +rocblas_status rocsolver_dsygvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + double* B, + const rocblas_int ldb, + double* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, lda, B, ldb, D, + info); +} + +rocblas_status rocsolver_chegvdj(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + rocblas_float_complex* B, + const rocblas_int ldb, + float* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, + lda, B, ldb, D, info); +} + +rocblas_status rocsolver_zhegvdj(rocblas_handle handle, + 
const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + rocblas_double_complex* B, + const rocblas_int ldb, + double* D, + rocblas_int* info) +{ + return rocsolver_sygvdj_hegvdj_impl(handle, itype, evect, uplo, n, A, + lda, B, ldb, D, info); +} + +} // extern C diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj.hpp b/library/src/lapack/roclapack_sygvdj_hegvdj.hpp new file mode 100644 index 000000000..cdd6c5a9e --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj.hpp @@ -0,0 +1,273 @@ +/************************************************************************* + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "rocblas.hpp" +#include "roclapack_potrf.hpp" +#include "roclapack_syevdj_heevdj.hpp" +#include "roclapack_sygst_hegst.hpp" +#include "roclapack_sygv_hegv.hpp" +#include "rocsolver/rocsolver.h" + +template +rocblas_status rocsolver_sygvdj_hegvdj_argCheck(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + const rocblas_int lda, + const rocblas_int ldb, + T A, + T B, + S D, + rocblas_int* info, + const rocblas_int batch_count = 1) +{ + // order is important for unit tests: + + // 1. invalid/non-supported values + if(itype != rocblas_eform_ax && itype != rocblas_eform_abx && itype != rocblas_eform_bax) + return rocblas_status_invalid_value; + if(evect != rocblas_evect_none && evect != rocblas_evect_original) + return rocblas_status_invalid_value; + if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) + return rocblas_status_invalid_value; + + // 2. invalid size + if(n < 0 || lda < n || ldb < n || batch_count < 0) + return rocblas_status_invalid_size; + + // skip pointer check if querying memory size + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_status_continue; + + // 3. 
invalid pointers + if((n && !A) || (n && !B) || (n && !D) || (batch_count && !info)) + return rocblas_status_invalid_pointer; + + return rocblas_status_continue; +} + +template +void rocsolver_sygvdj_hegvdj_getMemorySize(const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + const rocblas_int batch_count, + size_t* size_scalars, + size_t* size_work1, + size_t* size_work2, + size_t* size_work3, + size_t* size_work4, + size_t* size_workE, + size_t* size_workTau, + size_t* size_workVec, + size_t* size_workSplits, + size_t* size_iinfo, + size_t* size_workArr, + bool* optim_mem) +{ + // if quick return no need of workspace + if(n == 0 || batch_count == 0) + { + *size_scalars = 0; + *size_work1 = 0; + *size_work2 = 0; + *size_work3 = 0; + *size_work4 = 0; + *size_workE = 0; + *size_workTau = 0; + *size_workVec = 0; + *size_iinfo = 0; + *size_workSplits = 0; + *size_workArr = 0; + *optim_mem = true; + return; + } + + bool opt1, opt2, opt3 = true; + size_t unused, temp1, temp2, temp3, temp4, temp5; + + // requirements for calling POTRF + rocsolver_potrf_getMemorySize(n, uplo, batch_count, size_scalars, + size_work1, size_work2, size_work3, + size_work4, size_workArr, size_iinfo, &opt1); + *size_iinfo = max(*size_iinfo, sizeof(rocblas_int) * batch_count); + + // requirements for calling SYGST/HEGST + rocsolver_sygst_hegst_getMemorySize(uplo, itype, n, batch_count, &unused, + &temp1, &temp2, &temp3, &temp4, &opt2); + *size_work1 = max(*size_work1, temp1); + *size_work2 = max(*size_work2, temp2); + *size_work3 = max(*size_work3, temp3); + *size_work4 = max(*size_work4, temp4); + + // requirements for calling SYEVDJ/HEEVDJ + rocsolver_syevdj_heevdj_getMemorySize( + evect, uplo, n, batch_count, &unused, size_workE, size_workTau, size_workVec, + size_workSplits, &temp1, &temp2, &temp3, &temp4, &temp5); + *size_work1 = max(*size_work1, temp1); + *size_work2 = max(*size_work2, temp2); + *size_work3 = max(*size_work3, temp3); + 
*size_work4 = max(*size_work4, temp4); + *size_workArr = max(*size_workArr, temp5); + + if(evect == rocblas_evect_original) + { + if(itype == rocblas_eform_ax || itype == rocblas_eform_abx) + { + // requirements for calling TRSM + rocblas_operation trans + = (uplo == rocblas_fill_upper ? rocblas_operation_none + : rocblas_operation_conjugate_transpose); + rocsolver_trsm_mem(rocblas_side_left, trans, n, n, batch_count, + &temp1, &temp2, &temp3, &temp4, &opt3); + *size_work1 = max(*size_work1, temp1); + *size_work2 = max(*size_work2, temp2); + *size_work3 = max(*size_work3, temp3); + *size_work4 = max(*size_work4, temp4); + } + } + + *optim_mem = opt1 && opt2 && opt3; +} + +template +rocblas_status rocsolver_sygvdj_hegvdj_template(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int shiftA, + const rocblas_int lda, + const rocblas_stride strideA, + U B, + const rocblas_int shiftB, + const rocblas_int ldb, + const rocblas_stride strideB, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count, + T* scalars, + void* work1, + void* work2, + void* work3, + void* work4, + S* workE, + T* workTau, + T* workVec, + rocblas_int* workSplits, + rocblas_int* iinfo, + void* workArr, + bool optim_mem) +{ + ROCSOLVER_ENTER("sygvdj_hegvdj", "itype:", itype, "evect:", evect, "uplo:", uplo, "n:", n, + "shiftA:", shiftA, "lda:", lda, "shiftB:", shiftB, "ldb:", ldb, + "bc:", batch_count); + + // quick return + if(batch_count == 0) + return rocblas_status_success; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + rocblas_int blocksReset = (batch_count - 1) / BS1 + 1; + dim3 gridReset(blocksReset, 1, 1); + dim3 threads(BS1, 1, 1); + + // info=0 (starting with no errors) + ROCSOLVER_LAUNCH_KERNEL(reset_info, gridReset, threads, 0, stream, info, batch_count, 0); + + // quick return + if(n == 0) + return rocblas_status_success; + + // 
everything must be executed with scalars on the host + rocblas_pointer_mode old_mode; + rocblas_get_pointer_mode(handle, &old_mode); + rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host); + + // constants for rocblas functions calls + T one = 1; + + // perform Cholesky factorization of B + rocsolver_potrf_template(handle, uplo, n, B, shiftB, ldb, strideB, info, + batch_count, scalars, work1, work2, work3, + work4, (T*)workArr, iinfo, optim_mem); + + /** (TODO: Strictly speaking, computations should stop here if B is not positive definite. + A should not be modified in this case as no eigenvalues or eigenvectors can be computed. + Need to find a way to do this efficiently; for now A will be destroyed in the non + positive-definite case) **/ + + // reduce to standard eigenvalue problem and solve + rocsolver_sygst_hegst_template( + handle, itype, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, batch_count, + scalars, work1, work2, work3, work4, optim_mem); + + rocsolver_syevdj_heevdj_template( + handle, evect, uplo, n, A, shiftA, lda, strideA, D, strideD, iinfo, batch_count, scalars, + workE, workTau, workVec, workSplits, work1, work2, work3, work4, workArr); + + // combine info from POTRF with info from SYEVDJ/HEEVDJ + ROCSOLVER_LAUNCH_KERNEL(sygv_update_info, gridReset, threads, 0, stream, info, iinfo, n, + batch_count); + + // backtransform eigenvectors + if(evect == rocblas_evect_original) + { + if(itype == rocblas_eform_ax || itype == rocblas_eform_abx) + { + if(uplo == rocblas_fill_upper) + rocsolver_trsm_upper( + handle, rocblas_side_left, rocblas_operation_none, rocblas_diagonal_non_unit, n, + n, B, shiftB, ldb, strideB, A, shiftA, lda, strideA, batch_count, optim_mem, + work1, work2, work3, work4); + else + rocsolver_trsm_lower( + handle, rocblas_side_left, rocblas_operation_conjugate_transpose, + rocblas_diagonal_non_unit, n, n, B, shiftB, ldb, strideB, A, shiftA, lda, + strideA, batch_count, optim_mem, work1, work2, work3, work4); + } + else 
+ { + rocblas_operation trans + = (uplo == rocblas_fill_upper ? rocblas_operation_conjugate_transpose + : rocblas_operation_none); + rocblasCall_trmm(handle, rocblas_side_left, uplo, trans, rocblas_diagonal_non_unit, n, + n, &one, 0, B, shiftB, ldb, strideB, A, shiftA, lda, strideA, + batch_count, (T**)workArr); + } + } + + rocblas_set_pointer_mode(handle, old_mode); + return rocblas_status_success; +} diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp b/library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp new file mode 100644 index 000000000..0e04aca02 --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj_batched.cpp @@ -0,0 +1,202 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_sygvdj_hegvdj.hpp" + +template +rocblas_status rocsolver_sygvdj_hegvdj_batched_impl(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int lda, + U B, + const rocblas_int ldb, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"sygvdj_batched" : "hegvdj_batched"); + ROCSOLVER_ENTER_TOP(name, "--itype", itype, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", + lda, "--ldb", ldb, "--strideD", strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_sygvdj_hegvdj_argCheck(handle, itype, evect, uplo, n, lda, ldb, A, + B, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + rocblas_int shiftB = 0; + + // batched execution + rocblas_stride strideA = 0; + rocblas_stride strideB = 0; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + bool optim_mem; + size_t size_work1, size_work2, size_work3, size_work4; + // extra requirements for calling SYEVDJ/HEEVDJ + size_t size_workE; + size_t size_workTau; + size_t size_workVec; + size_t size_workSplits; + size_t size_workArr; + // size of temporary info array + size_t size_iinfo; + + rocsolver_sygvdj_hegvdj_getMemorySize( + itype, evect, uplo, n, batch_count, &size_scalars, &size_work1, &size_work2, &size_work3, + &size_work4, &size_workE, &size_workTau, &size_workVec, &size_workSplits, &size_iinfo, + &size_workArr, &optim_mem); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_work1, size_work2, size_work3, size_work4, size_workE, + size_workTau, size_workVec, size_workSplits, size_iinfo, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workTau, *workVec, *workSplits, + *workArr, *iinfo; + rocblas_device_malloc mem(handle, size_scalars, size_work1, size_work2, size_work3, size_work4, + size_workE, size_workTau, size_workVec, size_workSplits, size_iinfo, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + work1 = mem[1]; 
+ work2 = mem[2]; + work3 = mem[3]; + work4 = mem[4]; + workE = mem[5]; + workTau = mem[6]; + workVec = mem[7]; + workSplits = mem[8]; + iinfo = mem[9]; + workArr = mem[10]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_sygvdj_hegvdj_template( + handle, itype, evect, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, D, strideD, + info, batch_count, (T*)scalars, work1, work2, work3, work4, (S*)workE, (T*)workTau, + (T*)workVec, (rocblas_int*)workSplits, (rocblas_int*)iinfo, workArr, optim_mem); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* const A[], + const rocblas_int lda, + float* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl(handle, itype, evect, uplo, n, A, lda, B, + ldb, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_dsygvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* const A[], + const rocblas_int lda, + double* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl(handle, itype, evect, uplo, n, A, lda, B, + ldb, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_chegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* const A[], + const rocblas_int lda, + 
rocblas_float_complex* const B[], + const rocblas_int ldb, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl( + handle, itype, evect, uplo, n, A, lda, B, ldb, D, strideD, info, batch_count); +} + +rocblas_status rocsolver_zhegvdj_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* const A[], + const rocblas_int lda, + rocblas_double_complex* const B[], + const rocblas_int ldb, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_batched_impl( + handle, itype, evect, uplo, n, A, lda, B, ldb, D, strideD, info, batch_count); +} + +} // extern C diff --git a/library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp b/library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp new file mode 100644 index 000000000..e31903501 --- /dev/null +++ b/library/src/lapack/roclapack_sygvdj_hegvdj_strided_batched.cpp @@ -0,0 +1,213 @@ +/* ************************************************************************** + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "roclapack_sygvdj_hegvdj.hpp" + +template +rocblas_status rocsolver_sygvdj_hegvdj_strided_batched_impl(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + U A, + const rocblas_int lda, + const rocblas_stride strideA, + U B, + const rocblas_int ldb, + const rocblas_stride strideB, + S* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + const char* name = (!rocblas_is_complex ? 
"sygvdj_strided_batched" : "hegvdj_strided_batched"); + ROCSOLVER_ENTER_TOP(name, "--itype", itype, "--evect", evect, "--uplo", uplo, "-n", n, "--lda", + lda, "--strideA", strideA, "--ldb", ldb, "--strideB", strideB, "--strideD", + strideD, "--batch_count", batch_count); + + if(!handle) + return rocblas_status_invalid_handle; + + // argument checking + rocblas_status st = rocsolver_sygvdj_hegvdj_argCheck(handle, itype, evect, uplo, n, lda, ldb, A, + B, D, info, batch_count); + if(st != rocblas_status_continue) + return st; + + // working with unshifted arrays + rocblas_int shiftA = 0; + rocblas_int shiftB = 0; + + // memory workspace sizes: + // size for constants in rocblas calls + size_t size_scalars; + // size of reusable workspaces + bool optim_mem; + size_t size_work1, size_work2, size_work3, size_work4; + // extra requirements for calling SYEVDJ/HEEVDJ + size_t size_workE; + size_t size_workTau; + size_t size_workVec; + size_t size_workSplits; + size_t size_workArr; + // size of temporary info array + size_t size_iinfo; + + rocsolver_sygvdj_hegvdj_getMemorySize( + itype, evect, uplo, n, batch_count, &size_scalars, &size_work1, &size_work2, &size_work3, + &size_work4, &size_workE, &size_workTau, &size_workVec, &size_workSplits, &size_iinfo, + &size_workArr, &optim_mem); + + if(rocblas_is_device_memory_size_query(handle)) + return rocblas_set_optimal_device_memory_size( + handle, size_scalars, size_work1, size_work2, size_work3, size_work4, size_workE, + size_workTau, size_workVec, size_workSplits, size_iinfo, size_workArr); + + // memory workspace allocation + void *scalars, *work1, *work2, *work3, *work4, *workE, *workTau, *workVec, *workSplits, + *workArr, *iinfo; + rocblas_device_malloc mem(handle, size_scalars, size_work1, size_work2, size_work3, size_work4, + size_workE, size_workTau, size_workVec, size_workSplits, size_iinfo, + size_workArr); + + if(!mem) + return rocblas_status_memory_error; + + scalars = mem[0]; + work1 = mem[1]; + work2 = mem[2]; + 
work3 = mem[3]; + work4 = mem[4]; + workE = mem[5]; + workTau = mem[6]; + workVec = mem[7]; + workSplits = mem[8]; + iinfo = mem[9]; + workArr = mem[10]; + if(size_scalars > 0) + init_scalars(handle, (T*)scalars); + + // execution + return rocsolver_sygvdj_hegvdj_template( + handle, itype, evect, uplo, n, A, shiftA, lda, strideA, B, shiftB, ldb, strideB, D, strideD, + info, batch_count, (T*)scalars, work1, work2, work3, work4, (S*)workE, (T*)workTau, + (T*)workVec, (rocblas_int*)workSplits, (rocblas_int*)iinfo, workArr, optim_mem); +} + +/* + * =========================================================================== + * C wrapper + * =========================================================================== + */ + +extern "C" { + +rocblas_status rocsolver_ssygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + float* A, + const rocblas_int lda, + const rocblas_stride strideA, + float* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl(handle, itype, evect, uplo, n, A, + lda, strideA, B, ldb, strideB, D, + strideD, info, batch_count); +} + +rocblas_status rocsolver_dsygvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + double* A, + const rocblas_int lda, + const rocblas_stride strideA, + double* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl(handle, itype, evect, uplo, n, A, + lda, strideA, B, ldb, strideB, D, + strideD, info, batch_count); +} + +rocblas_status rocsolver_chegvdj_strided_batched(rocblas_handle handle, + const 
rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_float_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_float_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + float* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl( + handle, itype, evect, uplo, n, A, lda, strideA, B, ldb, strideB, D, strideD, info, + batch_count); +} + +rocblas_status rocsolver_zhegvdj_strided_batched(rocblas_handle handle, + const rocblas_eform itype, + const rocblas_evect evect, + const rocblas_fill uplo, + const rocblas_int n, + rocblas_double_complex* A, + const rocblas_int lda, + const rocblas_stride strideA, + rocblas_double_complex* B, + const rocblas_int ldb, + const rocblas_stride strideB, + double* D, + const rocblas_stride strideD, + rocblas_int* info, + const rocblas_int batch_count) +{ + return rocsolver_sygvdj_hegvdj_strided_batched_impl( + handle, itype, evect, uplo, n, A, lda, strideA, B, ldb, strideB, D, strideD, info, + batch_count); +} + +} // extern C